diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index bc474e99abf..72341590e75 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -41,6 +41,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; using arrow::internal::checked_cast; +using arrow::util::SafeCopy; namespace parquet { namespace { @@ -55,6 +56,9 @@ template struct CompareHelper { using T = typename DType::c_type; + static_assert(!std::is_unsigned::value || std::is_same::value, + "T is an unsigned numeric"); + constexpr static T DefaultMin() { return std::numeric_limits::max(); } constexpr static T DefaultMax() { return std::numeric_limits::lowest(); } @@ -83,12 +87,24 @@ struct UnsignedCompareHelperBase { using T = typename DType::c_type; using UCType = typename std::make_unsigned::type; - constexpr static T DefaultMin() { return std::numeric_limits::max(); } - constexpr static T DefaultMax() { return std::numeric_limits::lowest(); } + static_assert(!std::is_same::value, "T is unsigned"); + static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size"); + + // NOTE: according to the C++ spec, unsigned-to-signed conversion is + // implementation-defined if the original value does not fit in the signed type + // (i.e., two's complement cannot be assumed even on mainstream machines, + // because the compiler may decide otherwise). Hence the use of `SafeCopy` + // below for deterministic bit-casting. + // (see "Integer conversions" in + // https://en.cppreference.com/w/cpp/language/implicit_conversion) + + static const T DefaultMin() { return SafeCopy(std::numeric_limits::max()); } + static const T DefaultMax() { return 0; } + static T Coalesce(T val, T fallback) { return val; } - static inline bool Compare(int type_length, T a, T b) { - return ::arrow::util::SafeCopy(a) < ::arrow::util::SafeCopy(b); + static bool Compare(int type_length, T a, T b) { + return SafeCopy(a) < SafeCopy(b); } static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; } @@ -107,12 +123,12 @@ struct CompareHelper { using msb_type = typename std::conditional::type; static T DefaultMin() { - uint32_t kMsbMax = std::numeric_limits::max(); + uint32_t kMsbMax = SafeCopy(std::numeric_limits::max()); uint32_t kMax = std::numeric_limits::max(); return {kMax, kMax, kMsbMax}; } static T DefaultMax() { - uint32_t kMsbMin = std::numeric_limits::min(); + uint32_t kMsbMin = SafeCopy(std::numeric_limits::min()); uint32_t kMin = std::numeric_limits::min(); return {kMin, kMin, kMsbMin}; } @@ -122,8 +138,7 @@ struct CompareHelper { if (a.value[2] != b.value[2]) { // Only the MSB bit is by Signed comparison. For little-endian, this is the // last bit of Int96 type. - return ::arrow::util::SafeCopy(a.value[2]) < - ::arrow::util::SafeCopy(b.value[2]); + return SafeCopy(a.value[2]) < SafeCopy(b.value[2]); } else if (a.value[1] != b.value[1]) { return (a.value[1] < b.value[1]); } @@ -374,6 +389,28 @@ class TypedComparatorImpl : virtual public TypedComparator { int type_length_; }; +// ARROW-11675: A hand-written version of GetMinMax(), to work around +// what looks like a MSVC code generation bug. +// This does not seem to be required for GetMinMaxSpaced(). +template <> +std::pair +TypedComparatorImpl::GetMinMax(const int32_t* values, + int64_t length) { + DCHECK_GT(length, 0); + + const uint32_t* unsigned_values = reinterpret_cast(values); + uint32_t min = std::numeric_limits::max(); + uint32_t max = std::numeric_limits::lowest(); + + for (int64_t i = 0; i < length; i++) { + const auto val = unsigned_values[i]; + min = std::min(min, val); + max = std::max(max, val); + } + + return {SafeCopy(min), SafeCopy(max)}; +} + template std::pair TypedComparatorImpl::GetMinMax(const ::arrow::Array& values) { diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 8a275fd0936..dbd7d98b238 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -30,6 +30,7 @@ #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/ubsan.h" #include "parquet/column_reader.h" #include "parquet/column_writer.h" @@ -44,6 +45,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; +using arrow::util::SafeCopy; namespace BitUtil = arrow::BitUtil; @@ -702,10 +704,11 @@ class TestStatisticsSortOrder : public ::testing::Test { std::shared_ptr file_metadata = parquet_reader->metadata(); std::shared_ptr rg_metadata = file_metadata->RowGroup(0); for (int i = 0; i < static_cast(fields_.size()); i++) { + ARROW_SCOPED_TRACE("Statistics for field #", i); std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(i); - ASSERT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); - ASSERT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); + EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); + EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); } } @@ -934,11 +937,11 @@ template Update(values.data(), values.size(), 0); ASSERT_TRUE(stats->HasMinMax()); - ASSERT_EQ(stats->min(), expected_min); - ASSERT_EQ(stats->max(), expected_max); + EXPECT_EQ(stats->min(), expected_min); + EXPECT_EQ(stats->max(), expected_max); } -template +template void AssertMinMaxAre(Stats stats, const Array& values, const uint8_t* valid_bitmap, T expected_min, T expected_max) { auto n_values = values.size(); @@ -946,8 +949,8 @@ void AssertMinMaxAre(Stats stats, const Array& values, const uint8_t* valid_bitm auto non_null_count = n_values - null_count; stats->UpdateSpaced(values.data(), valid_bitmap, 0, non_null_count, null_count); ASSERT_TRUE(stats->HasMinMax()); - ASSERT_EQ(stats->min(), expected_min); - ASSERT_EQ(stats->max(), expected_max); + EXPECT_EQ(stats->min(), expected_min); + EXPECT_EQ(stats->max(), expected_max); } template @@ -966,17 +969,17 @@ void AssertUnsetMinMax(Stats stats, const Array& values, const uint8_t* valid_bi } template -void CheckExtremums() { +void CheckExtrema() { using UT = typename std::make_unsigned::type; - T smin = std::numeric_limits::min(); - T smax = std::numeric_limits::max(); - T umin = std::numeric_limits::min(); - T umax = std::numeric_limits::max(); + const T smin = std::numeric_limits::min(); + const T smax = std::numeric_limits::max(); + const T umin = SafeCopy(std::numeric_limits::min()); + const T umax = SafeCopy(std::numeric_limits::max()); constexpr int kNumValues = 8; std::array values{0, smin, smax, umin, - umax, smin + 1, smax - 1, umin - 1}; + umax, smin + 1, smax - 1, umax - 1}; NodePtr unsigned_node = PrimitiveNode::Make( "uint", Repetition::OPTIONAL, @@ -987,15 +990,47 @@ void CheckExtremums() { LogicalType::Int(sizeof(T) * CHAR_BIT, true /*signed*/), ParquetType::type_num); ColumnDescriptor signed_descr(signed_node, 1, 1); - auto unsigned_stats = MakeStatistics(&unsigned_descr); - AssertMinMaxAre(unsigned_stats, values, umin, umax); + { + ARROW_SCOPED_TRACE("unsigned statistics: umin = ", umin, ", umax = ", umax, + ", node type = ", unsigned_node->logical_type()->ToString(), + ", physical type = ", unsigned_descr.physical_type(), + ", sort order = ", unsigned_descr.sort_order()); + auto unsigned_stats = MakeStatistics(&unsigned_descr); + AssertMinMaxAre(unsigned_stats, values, umin, umax); + } + { + ARROW_SCOPED_TRACE("signed statistics: smin = ", smin, ", smax = ", smax, + ", node type = ", signed_node->logical_type()->ToString(), + ", physical type = ", signed_descr.physical_type(), + ", sort order = ", signed_descr.sort_order()); + auto signed_stats = MakeStatistics(&signed_descr); + AssertMinMaxAre(signed_stats, values, smin, smax); + } - auto signed_stats = MakeStatistics(&signed_descr); - AssertMinMaxAre(signed_stats, values, smin, smax); + // With validity bitmap + std::vector is_valid = {true, false, false, false, false, true, true, true}; + std::shared_ptr valid_bitmap; + ::arrow::BitmapFromVector(is_valid, &valid_bitmap); + { + ARROW_SCOPED_TRACE("spaced unsigned statistics: umin = ", umin, ", umax = ", umax, + ", node type = ", unsigned_node->logical_type()->ToString(), + ", physical type = ", unsigned_descr.physical_type(), + ", sort order = ", unsigned_descr.sort_order()); + auto unsigned_stats = MakeStatistics(&unsigned_descr); + AssertMinMaxAre(unsigned_stats, values, valid_bitmap->data(), T{0}, umax - 1); + } + { + ARROW_SCOPED_TRACE("spaced signed statistics: smin = ", smin, ", smax = ", smax, + ", node type = ", signed_node->logical_type()->ToString(), + ", physical type = ", signed_descr.physical_type(), + ", sort order = ", signed_descr.sort_order()); + auto signed_stats = MakeStatistics(&signed_descr); + AssertMinMaxAre(signed_stats, values, valid_bitmap->data(), smin + 1, smax - 1); + } } -TEST(TestStatistic, Int32Extremums) { CheckExtremums(); } -TEST(TestStatistic, Int64Extremums) { CheckExtremums(); } +TEST(TestStatistic, Int32Extrema) { CheckExtrema(); } +TEST(TestStatistic, Int64Extrema) { CheckExtrema(); } // PARQUET-1225: Float NaN values may lead to incorrect min-max template diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json index b724412d397..5f92affa4eb 100644 --- a/cpp/vcpkg.json +++ b/cpp/vcpkg.json @@ -15,7 +15,9 @@ ] }, "benchmark", - "boost", + "boost-filesystem", + "boost-multiprecision", + "boost-system", "brotli", "bzip2", "c-ares", diff --git a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat index 12ff9b4b618..6423720c225 100644 --- a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat +++ b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat @@ -60,7 +60,7 @@ cmake -G "Visual Studio 16 2019" -A x64 ^ -DARROW_CXXFLAGS="/MP" ^ -DARROW_DATASET=ON ^ -DARROW_DEPENDENCY_SOURCE=VCPKG ^ - -DARROW_FLIGHT=ON ^ + -DARROW_FLIGHT=OFF ^ -DARROW_MIMALLOC=ON ^ -DARROW_PARQUET=ON ^ -DARROW_PYTHON=OFF ^ @@ -79,13 +79,8 @@ cmake --build . --target INSTALL --config Release || exit /B 1 @rem Test Arrow C++ library -@rem TODO(ARROW-11675): Uncomment the below -@rem and troubleshoot two test failures: -@rem - TestStatisticsSortOrder/0.MinMax -@rem - TestStatistic.Int32Extremums - -@rem ctest --output-on-failure ^ -@rem --parallel %NUMBER_OF_PROCESSORS% ^ -@rem --timeout 300 || exit /B 1 +ctest --output-on-failure ^ + --parallel %NUMBER_OF_PROCESSORS% ^ + --timeout 300 || exit /B 1 popd diff --git a/dev/tasks/vcpkg-tests/github.windows.yml b/dev/tasks/vcpkg-tests/github.windows.yml index eacb6317c30..ad3e793a6c3 100644 --- a/dev/tasks/vcpkg-tests/github.windows.yml +++ b/dev/tasks/vcpkg-tests/github.windows.yml @@ -36,15 +36,11 @@ jobs: git -C arrow checkout FETCH_HEAD git -C arrow submodule update --init --recursive - name: Remove and Reinstall vcpkg - # As of January 2021, the version of vcpkg that is preinstalled on the - # Github Actions windows-2019 image is 2020.11.12, as noted at - # https://github.com/actions/virtual-environments/blob/main/images/win/Windows2019-Readme.md - # This version of vcpkg has a bug that causes the installation of - # aws-cpp-sdk to fail. See details at - # https://github.com/awslabs/aws-c-common/issues/734 - # and https://github.com/microsoft/vcpkg/pull/14716. # When running vcpkg in Github Actions on Windows, remove the # preinstalled vcpkg and install the newest version from source. + # Versions of vcpkg rapidly stop working until updated, and + # the safest and most reliable way to update vcpkg is simply + # to remove and reinstall it. shell: cmd run: | CALL vcpkg integrate remove 2>NUL