From 578de0dbe3dde70665d534c03091214e8a1c17cb Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Wed, 23 Jun 2021 12:20:17 -0400 Subject: [PATCH 01/13] Uncomment ctest --- dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat index 12ff9b4b618..d41d60d1a1d 100644 --- a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat +++ b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat @@ -79,13 +79,8 @@ cmake --build . --target INSTALL --config Release || exit /B 1 @rem Test Arrow C++ library -@rem TODO(ARROW-11675): Uncomment the below -@rem and troubleshoot two test failures: -@rem - TestStatisticsSortOrder/0.MinMax -@rem - TestStatistic.Int32Extremums - -@rem ctest --output-on-failure ^ -@rem --parallel %NUMBER_OF_PROCESSORS% ^ -@rem --timeout 300 || exit /B 1 +ctest --output-on-failure ^ + --parallel %NUMBER_OF_PROCESSORS% ^ + --timeout 300 || exit /B 1 popd From 6c909c8ff512674b5d7ecb5322e5dae7c53140da Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 24 Jun 2021 10:11:36 +0200 Subject: [PATCH 02/13] Add debug output --- cpp/src/parquet/statistics_test.cc | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 8a275fd0936..8400b9afdde 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -702,10 +702,11 @@ class TestStatisticsSortOrder : public ::testing::Test { std::shared_ptr file_metadata = parquet_reader->metadata(); std::shared_ptr rg_metadata = file_metadata->RowGroup(0); for (int i = 0; i < static_cast(fields_.size()); i++) { + ARROW_SCOPED_TRACE("Statistics for field #", i); std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(i); - ASSERT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); - ASSERT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); + EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); + EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); } } @@ -934,11 +935,11 @@ template Update(values.data(), values.size(), 0); ASSERT_TRUE(stats->HasMinMax()); - ASSERT_EQ(stats->min(), expected_min); - ASSERT_EQ(stats->max(), expected_max); + EXPECT_EQ(stats->min(), expected_min); + EXPECT_EQ(stats->max(), expected_max); } -template +template void AssertMinMaxAre(Stats stats, const Array& values, const uint8_t* valid_bitmap, T expected_min, T expected_max) { auto n_values = values.size(); @@ -946,8 +947,8 @@ void AssertMinMaxAre(Stats stats, const Array& values, const uint8_t* valid_bitm auto non_null_count = n_values - null_count; stats->UpdateSpaced(values.data(), valid_bitmap, 0, non_null_count, null_count); ASSERT_TRUE(stats->HasMinMax()); - ASSERT_EQ(stats->min(), expected_min); - ASSERT_EQ(stats->max(), expected_max); + EXPECT_EQ(stats->min(), expected_min); + EXPECT_EQ(stats->max(), expected_max); } template @@ -969,10 +970,10 @@ template void CheckExtremums() { using UT = typename std::make_unsigned::type; - T smin = std::numeric_limits::min(); - T smax = std::numeric_limits::max(); - T umin = std::numeric_limits::min(); - T umax = std::numeric_limits::max(); + const T smin = std::numeric_limits::min(); + const T smax = std::numeric_limits::max(); + const T umin = static_cast(std::numeric_limits::min()); + const T umax = static_cast(std::numeric_limits::max()); constexpr int kNumValues = 8; std::array values{0, smin, smax, umin, @@ -987,9 +988,11 @@ void CheckExtremums() { LogicalType::Int(sizeof(T) * CHAR_BIT, true /*signed*/), ParquetType::type_num); ColumnDescriptor signed_descr(signed_node, 1, 1); + ARROW_SCOPED_TRACE("unsigned statistics: umin = ", umin, ", umax = ", umax); auto unsigned_stats = MakeStatistics(&unsigned_descr); AssertMinMaxAre(unsigned_stats, values, umin, umax); + ARROW_SCOPED_TRACE("signed statistics: smin = ", smin, ", smax = ", smax); auto signed_stats = MakeStatistics(&signed_descr); AssertMinMaxAre(signed_stats, values, smin, smax); } From e4c22c0a0f55174acb39af940bf1b755dcea302d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 24 Jun 2021 11:02:36 +0200 Subject: [PATCH 03/13] Try possible fix --- cpp/src/parquet/statistics.cc | 18 ++++++++++-------- cpp/vcpkg.json | 22 +++------------------- dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat | 2 +- 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index bc474e99abf..bac91bec2f1 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -31,7 +31,6 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/optional.h" -#include "arrow/util/ubsan.h" #include "arrow/visitor_inline.h" #include "parquet/encoding.h" #include "parquet/exception.h" @@ -83,12 +82,16 @@ struct UnsignedCompareHelperBase { using T = typename DType::c_type; using UCType = typename std::make_unsigned::type; - constexpr static T DefaultMin() { return std::numeric_limits::max(); } - constexpr static T DefaultMax() { return std::numeric_limits::lowest(); } + constexpr static T DefaultMin() { + return static_cast(std::numeric_limits::max()); + } + constexpr static T DefaultMax() { + return static_cast(std::numeric_limits::lowest()); + } static T Coalesce(T val, T fallback) { return val; } static inline bool Compare(int type_length, T a, T b) { - return ::arrow::util::SafeCopy(a) < ::arrow::util::SafeCopy(b); + return static_cast(a) < static_cast(b); } static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; } @@ -107,12 +110,12 @@ struct CompareHelper { using msb_type = typename std::conditional::type; static T DefaultMin() { - uint32_t kMsbMax = std::numeric_limits::max(); + uint32_t kMsbMax = static_cast(std::numeric_limits::max()); uint32_t kMax = std::numeric_limits::max(); return {kMax, kMax, kMsbMax}; } static T DefaultMax() { - uint32_t kMsbMin = std::numeric_limits::min(); + uint32_t kMsbMin = static_cast(std::numeric_limits::min()); uint32_t kMin = std::numeric_limits::min(); return {kMin, kMin, kMsbMin}; } @@ -122,8 +125,7 @@ struct CompareHelper { if (a.value[2] != b.value[2]) { // Only the MSB bit is by Signed comparison. For little-endian, this is the // last bit of Int96 type. - return ::arrow::util::SafeCopy(a.value[2]) < - ::arrow::util::SafeCopy(b.value[2]); + return static_cast(a.value[2]) < static_cast(b.value[2]); } else if (a.value[1] != b.value[1]) { return (a.value[1] < b.value[1]); } diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json index b724412d397..0b36dfe4b48 100644 --- a/cpp/vcpkg.json +++ b/cpp/vcpkg.json @@ -2,33 +2,17 @@ "name": "arrow", "version-string": "5.0.0-SNAPSHOT", "dependencies": [ - "abseil", - { - "name": "aws-sdk-cpp", - "features": [ - "config", - "cognito-identity", - "identity-management", - "s3", - "sts", - "transfer" - ] - }, "benchmark", - "boost", + "boost-filesystem", + "boost-multiprecision", + "boost-system", "brotli", "bzip2", - "c-ares", - "curl", "flatbuffers", "gflags", "glog", - "grpc", "gtest", "lz4", - "openssl", - "orc", - "protobuf", "rapidjson", "re2", "snappy", diff --git a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat index d41d60d1a1d..6423720c225 100644 --- a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat +++ b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat @@ -60,7 +60,7 @@ cmake -G "Visual Studio 16 2019" -A x64 ^ -DARROW_CXXFLAGS="/MP" ^ -DARROW_DATASET=ON ^ -DARROW_DEPENDENCY_SOURCE=VCPKG ^ - -DARROW_FLIGHT=ON ^ + -DARROW_FLIGHT=OFF ^ -DARROW_MIMALLOC=ON ^ -DARROW_PARQUET=ON ^ -DARROW_PYTHON=OFF ^ From fc3f3d7b4b4ce5e5bb18b45c4251889515c23fb1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 24 Jun 2021 13:57:11 +0200 Subject: [PATCH 04/13] Try caching binaries --- cpp/src/parquet/statistics.cc | 6 +++ cpp/src/parquet/statistics_test.cc | 23 +++++++--- dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat | 52 +++++++++++------------ dev/tasks/vcpkg-tests/github.windows.yml | 13 ++++++ 4 files changed, 61 insertions(+), 33 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index bac91bec2f1..8e6eb7bacb5 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -54,6 +54,9 @@ template struct CompareHelper { using T = typename DType::c_type; + static_assert(!std::is_unsigned::value || std::is_same::value, + "T is an unsigned numeric"); + constexpr static T DefaultMin() { return std::numeric_limits::max(); } constexpr static T DefaultMax() { return std::numeric_limits::lowest(); } @@ -82,6 +85,9 @@ struct UnsignedCompareHelperBase { using T = typename DType::c_type; using UCType = typename std::make_unsigned::type; + static_assert(!std::is_same::value, "T is unsigned"); + static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size"); + constexpr static T DefaultMin() { return static_cast(std::numeric_limits::max()); } diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 8400b9afdde..b4a011a54a1 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -988,13 +988,22 @@ void CheckExtremums() { LogicalType::Int(sizeof(T) * CHAR_BIT, true /*signed*/), ParquetType::type_num); ColumnDescriptor signed_descr(signed_node, 1, 1); - ARROW_SCOPED_TRACE("unsigned statistics: umin = ", umin, ", umax = ", umax); - auto unsigned_stats = MakeStatistics(&unsigned_descr); - AssertMinMaxAre(unsigned_stats, values, umin, umax); - - ARROW_SCOPED_TRACE("signed statistics: smin = ", smin, ", smax = ", smax); - auto signed_stats = MakeStatistics(&signed_descr); - AssertMinMaxAre(signed_stats, values, smin, smax); + { + ARROW_SCOPED_TRACE("unsigned statistics: umin = ", umin, ", umax = ", umax, + ", node type = ", unsigned_node->logical_type()->ToString(), + ", physical type = ", unsigned_descr.physical_type(), + ", sort order = ", unsigned_descr.sort_order()); + auto unsigned_stats = MakeStatistics(&unsigned_descr); + AssertMinMaxAre(unsigned_stats, values, umin, umax); + } + { + ARROW_SCOPED_TRACE("signed statistics: smin = ", smin, ", smax = ", smax, + ", node type = ", signed_node->logical_type()->ToString(), + ", physical type = ", signed_descr.physical_type(), + ", sort order = ", signed_descr.sort_order()); + auto signed_stats = MakeStatistics(&signed_descr); + AssertMinMaxAre(signed_stats, values, smin, smax); + } } TEST(TestStatistic, Int32Extremums) { CheckExtremums(); } diff --git a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat index 6423720c225..a0c13518dfc 100644 --- a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat +++ b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat @@ -52,35 +52,35 @@ pushd cpp\build @rem TODO(ianmcook): Add -DARROW_BUILD_BENCHMARKS=ON after the issue described @rem at https://github.com/google/benchmark/issues/1046 is resolved -cmake -G "Visual Studio 16 2019" -A x64 ^ - -DARROW_BOOST_USE_SHARED=ON ^ - -DARROW_BUILD_SHARED=ON ^ - -DARROW_BUILD_STATIC=OFF ^ - -DARROW_BUILD_TESTS=ON ^ - -DARROW_CXXFLAGS="/MP" ^ - -DARROW_DATASET=ON ^ - -DARROW_DEPENDENCY_SOURCE=VCPKG ^ - -DARROW_FLIGHT=OFF ^ - -DARROW_MIMALLOC=ON ^ - -DARROW_PARQUET=ON ^ - -DARROW_PYTHON=OFF ^ - -DARROW_WITH_BROTLI=ON ^ - -DARROW_WITH_BZ2=ON ^ - -DARROW_WITH_LZ4=ON ^ - -DARROW_WITH_SNAPPY=ON ^ - -DARROW_WITH_ZLIB=ON ^ - -DARROW_WITH_ZSTD=ON ^ - -DCMAKE_BUILD_TYPE=release ^ - -DCMAKE_UNITY_BUILD=ON ^ - .. || exit /B 1 - -cmake --build . --target INSTALL --config Release || exit /B 1 +rem cmake -G "Visual Studio 16 2019" -A x64 ^ +rem -DARROW_BOOST_USE_SHARED=ON ^ +rem -DARROW_BUILD_SHARED=ON ^ +rem -DARROW_BUILD_STATIC=OFF ^ +rem -DARROW_BUILD_TESTS=ON ^ +rem -DARROW_CXXFLAGS="/MP" ^ +rem -DARROW_DATASET=ON ^ +rem -DARROW_DEPENDENCY_SOURCE=VCPKG ^ +rem -DARROW_FLIGHT=OFF ^ +rem -DARROW_MIMALLOC=ON ^ +rem -DARROW_PARQUET=ON ^ +rem -DARROW_PYTHON=OFF ^ +rem -DARROW_WITH_BROTLI=ON ^ +rem -DARROW_WITH_BZ2=ON ^ +rem -DARROW_WITH_LZ4=ON ^ +rem -DARROW_WITH_SNAPPY=ON ^ +rem -DARROW_WITH_ZLIB=ON ^ +rem -DARROW_WITH_ZSTD=ON ^ +rem -DCMAKE_BUILD_TYPE=release ^ +rem -DCMAKE_UNITY_BUILD=ON ^ +rem .. || exit /B 1 +rem +rem cmake --build . --target INSTALL --config Release || exit /B 1 @rem Test Arrow C++ library -ctest --output-on-failure ^ - --parallel %NUMBER_OF_PROCESSORS% ^ - --timeout 300 || exit /B 1 +rem ctest --output-on-failure ^ +rem --parallel %NUMBER_OF_PROCESSORS% ^ +rem --timeout 300 || exit /B 1 popd diff --git a/dev/tasks/vcpkg-tests/github.windows.yml b/dev/tasks/vcpkg-tests/github.windows.yml index eacb6317c30..9caf262b046 100644 --- a/dev/tasks/vcpkg-tests/github.windows.yml +++ b/dev/tasks/vcpkg-tests/github.windows.yml @@ -36,6 +36,7 @@ jobs: git -C arrow checkout FETCH_HEAD git -C arrow submodule update --init --recursive - name: Remove and Reinstall vcpkg + # TODO: this comment seems outdated, the vcpkg on GHA is quite recent. # As of January 2021, the version of vcpkg that is preinstalled on the # Github Actions windows-2019 image is 2020.11.12, as noted at # https://github.com/actions/virtual-environments/blob/main/images/win/Windows2019-Readme.md @@ -56,6 +57,18 @@ jobs: CALL bootstrap-vcpkg.bat -win64 -disableMetrics CALL vcpkg integrate install CALL setx PATH "%PATH%;C:\vcpkg" + - name: vcpkg cache info + id: vcpkg-cache-info + shell: bash + run: | + echo "::set-output name=cache-dir::$LOCALAPPDATA/vcpkg/archives" + - name: Cache vcpkg binaries + uses: actions/cache@v2 + with: + {% raw %} + path: ${{ steps.vcpkg-cache-info.outputs.cache-dir }} + key: ${{ github.job }} + {% endraw %} - name: Install Dependencies with vcpkg and Build Arrow C++ shell: cmd run: | From da0ce4d7ce161a5a65612c78614185b3fb9ddd27 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 24 Jun 2021 16:01:32 +0200 Subject: [PATCH 05/13] Re-enable building --- dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat | 52 +++++++++++------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat index a0c13518dfc..6423720c225 100644 --- a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat +++ b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat @@ -52,35 +52,35 @@ pushd cpp\build @rem TODO(ianmcook): Add -DARROW_BUILD_BENCHMARKS=ON after the issue described @rem at https://github.com/google/benchmark/issues/1046 is resolved -rem cmake -G "Visual Studio 16 2019" -A x64 ^ -rem -DARROW_BOOST_USE_SHARED=ON ^ -rem -DARROW_BUILD_SHARED=ON ^ -rem -DARROW_BUILD_STATIC=OFF ^ -rem -DARROW_BUILD_TESTS=ON ^ -rem -DARROW_CXXFLAGS="/MP" ^ -rem -DARROW_DATASET=ON ^ -rem -DARROW_DEPENDENCY_SOURCE=VCPKG ^ -rem -DARROW_FLIGHT=OFF ^ -rem -DARROW_MIMALLOC=ON ^ -rem -DARROW_PARQUET=ON ^ -rem -DARROW_PYTHON=OFF ^ -rem -DARROW_WITH_BROTLI=ON ^ -rem -DARROW_WITH_BZ2=ON ^ -rem -DARROW_WITH_LZ4=ON ^ -rem -DARROW_WITH_SNAPPY=ON ^ -rem -DARROW_WITH_ZLIB=ON ^ -rem -DARROW_WITH_ZSTD=ON ^ -rem -DCMAKE_BUILD_TYPE=release ^ -rem -DCMAKE_UNITY_BUILD=ON ^ -rem .. || exit /B 1 -rem -rem cmake --build . --target INSTALL --config Release || exit /B 1 +cmake -G "Visual Studio 16 2019" -A x64 ^ + -DARROW_BOOST_USE_SHARED=ON ^ + -DARROW_BUILD_SHARED=ON ^ + -DARROW_BUILD_STATIC=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_CXXFLAGS="/MP" ^ + -DARROW_DATASET=ON ^ + -DARROW_DEPENDENCY_SOURCE=VCPKG ^ + -DARROW_FLIGHT=OFF ^ + -DARROW_MIMALLOC=ON ^ + -DARROW_PARQUET=ON ^ + -DARROW_PYTHON=OFF ^ + -DARROW_WITH_BROTLI=ON ^ + -DARROW_WITH_BZ2=ON ^ + -DARROW_WITH_LZ4=ON ^ + -DARROW_WITH_SNAPPY=ON ^ + -DARROW_WITH_ZLIB=ON ^ + -DARROW_WITH_ZSTD=ON ^ + -DCMAKE_BUILD_TYPE=release ^ + -DCMAKE_UNITY_BUILD=ON ^ + .. || exit /B 1 + +cmake --build . --target INSTALL --config Release || exit /B 1 @rem Test Arrow C++ library -rem ctest --output-on-failure ^ -rem --parallel %NUMBER_OF_PROCESSORS% ^ -rem --timeout 300 || exit /B 1 +ctest --output-on-failure ^ + --parallel %NUMBER_OF_PROCESSORS% ^ + --timeout 300 || exit /B 1 popd From 4cca568c8eac9e5a3131f65835d36a3cbda0b5fb Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 24 Jun 2021 17:26:24 +0200 Subject: [PATCH 06/13] More debug info --- cpp/src/parquet/statistics.cc | 22 ++++++++++++++++++---- cpp/src/parquet/statistics_test.cc | 6 +++--- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 8e6eb7bacb5..fd0b14a5822 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -88,20 +88,34 @@ struct UnsignedCompareHelperBase { static_assert(!std::is_same::value, "T is unsigned"); static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size"); - constexpr static T DefaultMin() { + static const T DefaultMin() { + ARROW_LOG(INFO) << "DefaultMin = " + << static_cast(std::numeric_limits::max()); return static_cast(std::numeric_limits::max()); } - constexpr static T DefaultMax() { + static const T DefaultMax() { + ARROW_LOG(INFO) << "DefaultMax = " + << static_cast(std::numeric_limits::lowest()); return static_cast(std::numeric_limits::lowest()); } static T Coalesce(T val, T fallback) { return val; } static inline bool Compare(int type_length, T a, T b) { + ARROW_LOG(INFO) << "UCHBase::Compare(" << a << ", " << b << ") -> " + << (static_cast(a) < static_cast(b)); return static_cast(a) < static_cast(b); } - static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; } - static T Max(int type_length, T a, T b) { return Compare(type_length, a, b) ? b : a; } + static T Min(int type_length, T a, T b) { + const auto res = Compare(type_length, a, b) ? a : b; + ARROW_LOG(INFO) << "UCHBase::Min(" << a << ", " << b << ") -> " << res; + return res; + } + static T Max(int type_length, T a, T b) { + const auto res = Compare(type_length, a, b) ? b : a; + ARROW_LOG(INFO) << "UCHBase::Max(" << a << ", " << b << ") -> " << res; + return res; + } }; template <> diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index b4a011a54a1..4157a810b59 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -967,7 +967,7 @@ void AssertUnsetMinMax(Stats stats, const Array& values, const uint8_t* valid_bi } template -void CheckExtremums() { +void CheckExtrema() { using UT = typename std::make_unsigned::type; const T smin = std::numeric_limits::min(); @@ -1006,8 +1006,8 @@ void CheckExtremums() { } } -TEST(TestStatistic, Int32Extremums) { CheckExtremums(); } -TEST(TestStatistic, Int64Extremums) { CheckExtremums(); } +TEST(TestStatistic, Int32Extrema) { CheckExtrema(); } +TEST(TestStatistic, Int64Extrema) { CheckExtrema(); } // PARQUET-1225: Float NaN values may lead to incorrect min-max template From 899c5b8180ed500da20c05f087391b696fae455c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 24 Jun 2021 19:53:43 +0200 Subject: [PATCH 07/13] Remove debug log --- cpp/src/parquet/statistics.cc | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index fd0b14a5822..ab71e18f101 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -89,32 +89,22 @@ struct UnsignedCompareHelperBase { static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size"); static const T DefaultMin() { - ARROW_LOG(INFO) << "DefaultMin = " - << static_cast(std::numeric_limits::max()); return static_cast(std::numeric_limits::max()); } static const T DefaultMax() { - ARROW_LOG(INFO) << "DefaultMax = " - << static_cast(std::numeric_limits::lowest()); return static_cast(std::numeric_limits::lowest()); } static T Coalesce(T val, T fallback) { return val; } static inline bool Compare(int type_length, T a, T b) { - ARROW_LOG(INFO) << "UCHBase::Compare(" << a << ", " << b << ") -> " - << (static_cast(a) < static_cast(b)); return static_cast(a) < static_cast(b); } static T Min(int type_length, T a, T b) { - const auto res = Compare(type_length, a, b) ? a : b; - ARROW_LOG(INFO) << "UCHBase::Min(" << a << ", " << b << ") -> " << res; - return res; + return Compare(type_length, a, b) ? a : b; } static T Max(int type_length, T a, T b) { - const auto res = Compare(type_length, a, b) ? b : a; - ARROW_LOG(INFO) << "UCHBase::Max(" << a << ", " << b << ") -> " << res; - return res; + return Compare(type_length, a, b) ? b : a; } }; From 07929c29675d416cc26e6e6f2dce74b610f66de7 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 24 Jun 2021 21:14:49 +0200 Subject: [PATCH 08/13] Try SafeCopy --- cpp/src/parquet/statistics.cc | 26 +++++++++++--------------- cpp/src/parquet/statistics_test.cc | 6 ++++-- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index ab71e18f101..3b1d86b198f 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -31,6 +31,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/optional.h" +#include "arrow/util/ubsan.h" #include "arrow/visitor_inline.h" #include "parquet/encoding.h" #include "parquet/exception.h" @@ -40,6 +41,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; using arrow::internal::checked_cast; +using arrow::util::SafeCopy; namespace parquet { namespace { @@ -88,24 +90,18 @@ struct UnsignedCompareHelperBase { static_assert(!std::is_same::value, "T is unsigned"); static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size"); - static const T DefaultMin() { - return static_cast(std::numeric_limits::max()); - } + static const T DefaultMin() { return SafeCopy(std::numeric_limits::max()); } static const T DefaultMax() { - return static_cast(std::numeric_limits::lowest()); + return SafeCopy(std::numeric_limits::lowest()); } static T Coalesce(T val, T fallback) { return val; } - static inline bool Compare(int type_length, T a, T b) { - return static_cast(a) < static_cast(b); + static bool Compare(int type_length, T a, T b) { + return SafeCopy(a) < SafeCopy(b); } - static T Min(int type_length, T a, T b) { - return Compare(type_length, a, b) ? a : b; - } - static T Max(int type_length, T a, T b) { - return Compare(type_length, a, b) ? b : a; - } + static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; } + static T Max(int type_length, T a, T b) { return Compare(type_length, a, b) ? b : a; } }; template <> @@ -120,12 +116,12 @@ struct CompareHelper { using msb_type = typename std::conditional::type; static T DefaultMin() { - uint32_t kMsbMax = static_cast(std::numeric_limits::max()); + uint32_t kMsbMax = SafeCopy(std::numeric_limits::max()); uint32_t kMax = std::numeric_limits::max(); return {kMax, kMax, kMsbMax}; } static T DefaultMax() { - uint32_t kMsbMin = static_cast(std::numeric_limits::min()); + uint32_t kMsbMin = SafeCopy(std::numeric_limits::min()); uint32_t kMin = std::numeric_limits::min(); return {kMin, kMin, kMsbMin}; } @@ -135,7 +131,7 @@ struct CompareHelper { if (a.value[2] != b.value[2]) { // Only the MSB bit is by Signed comparison. For little-endian, this is the // last bit of Int96 type. - return static_cast(a.value[2]) < static_cast(b.value[2]); + return SafeCopy(a.value[2]) < SafeCopy(b.value[2]); } else if (a.value[1] != b.value[1]) { return (a.value[1] < b.value[1]); } diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 4157a810b59..9eea743e440 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -30,6 +30,7 @@ #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/ubsan.h" #include "parquet/column_reader.h" #include "parquet/column_writer.h" @@ -44,6 +45,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; +using arrow::util::SafeCopy; namespace BitUtil = arrow::BitUtil; @@ -972,8 +974,8 @@ void CheckExtrema() { const T smin = std::numeric_limits::min(); const T smax = std::numeric_limits::max(); - const T umin = static_cast(std::numeric_limits::min()); - const T umax = static_cast(std::numeric_limits::max()); + const T umin = SafeCopy(std::numeric_limits::min()); + const T umax = SafeCopy(std::numeric_limits::max()); constexpr int kNumValues = 8; std::array values{0, smin, smax, umin, From 77739832fe1e9a8dde34f7249f90e0fabe639937 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 24 Jun 2021 22:19:35 +0200 Subject: [PATCH 09/13] Add unsigned Int32 specialization --- cpp/src/parquet/statistics.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 3b1d86b198f..9d92bd11b32 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -382,6 +382,25 @@ class TypedComparatorImpl : virtual public TypedComparator { int type_length_; }; +template <> +std::pair +TypedComparatorImpl::GetMinMax(const int32_t* values, + int64_t length) { + DCHECK_GT(length, 0); + + const uint32_t* unsigned_values = reinterpret_cast(values); + uint32_t min = std::numeric_limits::max(); + uint32_t max = std::numeric_limits::lowest(); + + for (int64_t i = 0; i < length; i++) { + const auto val = unsigned_values[i]; + min = std::min(min, val); + max = std::max(max, val); + } + + return {SafeCopy(min), SafeCopy(max)}; +} + template std::pair TypedComparatorImpl::GetMinMax(const ::arrow::Array& values) { From 4ab145d835b581636f6aa684252362d9e046bc62 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 28 Jun 2021 15:54:25 +0200 Subject: [PATCH 10/13] Add spaced tests --- cpp/src/parquet/statistics_test.cc | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 9eea743e440..dbd7d98b238 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -979,7 +979,7 @@ void CheckExtrema() { constexpr int kNumValues = 8; std::array values{0, smin, smax, umin, - umax, smin + 1, smax - 1, umin - 1}; + umax, smin + 1, smax - 1, umax - 1}; NodePtr unsigned_node = PrimitiveNode::Make( "uint", Repetition::OPTIONAL, @@ -1006,6 +1006,27 @@ void CheckExtrema() { auto signed_stats = MakeStatistics(&signed_descr); AssertMinMaxAre(signed_stats, values, smin, smax); } + + // With validity bitmap + std::vector is_valid = {true, false, false, false, false, true, true, true}; + std::shared_ptr valid_bitmap; + ::arrow::BitmapFromVector(is_valid, &valid_bitmap); + { + ARROW_SCOPED_TRACE("spaced unsigned statistics: umin = ", umin, ", umax = ", umax, + ", node type = ", unsigned_node->logical_type()->ToString(), + ", physical type = ", unsigned_descr.physical_type(), + ", sort order = ", unsigned_descr.sort_order()); + auto unsigned_stats = MakeStatistics(&unsigned_descr); + AssertMinMaxAre(unsigned_stats, values, valid_bitmap->data(), T{0}, umax - 1); + } + { + ARROW_SCOPED_TRACE("spaced signed statistics: smin = ", smin, ", smax = ", smax, + ", node type = ", signed_node->logical_type()->ToString(), + ", physical type = ", signed_descr.physical_type(), + ", sort order = ", signed_descr.sort_order()); + auto signed_stats = MakeStatistics(&signed_descr); + AssertMinMaxAre(signed_stats, values, valid_bitmap->data(), smin + 1, smax - 1); + } } TEST(TestStatistic, Int32Extrema) { CheckExtrema(); } From 9b3a20b8e76228385d48d0619c9029e1df8894ee Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 28 Jun 2021 17:30:19 +0200 Subject: [PATCH 11/13] Cleanups --- cpp/src/parquet/statistics.cc | 11 ++++++++ cpp/vcpkg.json | 18 +++++++++++++ dev/tasks/vcpkg-tests/github.windows.yml | 34 ------------------------ 3 files changed, 29 insertions(+), 34 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 9d92bd11b32..7d22124c165 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -90,6 +90,14 @@ struct UnsignedCompareHelperBase { static_assert(!std::is_same::value, "T is unsigned"); static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size"); + // NOTE: according to the C++ spec, unsigned-to-signed conversion is + // implementation-defined if the original value does not fix in the signed type + // (i.e., two's complement cannot be assumed even on mainstream machines, + // because the compiler may decide otherwise). Hence the use of `SafeCopy` + // below for deterministic bit-casting. + // (see "Integer conversions" in + // https://en.cppreference.com/w/cpp/language/implicit_conversion) + static const T DefaultMin() { return SafeCopy(std::numeric_limits::max()); } static const T DefaultMax() { return SafeCopy(std::numeric_limits::lowest()); @@ -382,6 +390,9 @@ class TypedComparatorImpl : virtual public TypedComparator { int type_length_; }; +// ARROW-11675: A hand-written version of GetMinMax(), to work around +// what looks like a MSVC code generation bug. +// This does not seem to be required for GetMinMaxSpaced(). template <> std::pair TypedComparatorImpl::GetMinMax(const int32_t* values, diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json index 0b36dfe4b48..5f92affa4eb 100644 --- a/cpp/vcpkg.json +++ b/cpp/vcpkg.json @@ -2,17 +2,35 @@ "name": "arrow", "version-string": "5.0.0-SNAPSHOT", "dependencies": [ + "abseil", + { + "name": "aws-sdk-cpp", + "features": [ + "config", + "cognito-identity", + "identity-management", + "s3", + "sts", + "transfer" + ] + }, "benchmark", "boost-filesystem", "boost-multiprecision", "boost-system", "brotli", "bzip2", + "c-ares", + "curl", "flatbuffers", "gflags", "glog", + "grpc", "gtest", "lz4", + "openssl", + "orc", + "protobuf", "rapidjson", "re2", "snappy", diff --git a/dev/tasks/vcpkg-tests/github.windows.yml b/dev/tasks/vcpkg-tests/github.windows.yml index 9caf262b046..240f4c94e01 100644 --- a/dev/tasks/vcpkg-tests/github.windows.yml +++ b/dev/tasks/vcpkg-tests/github.windows.yml @@ -35,40 +35,6 @@ jobs: git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} git -C arrow checkout FETCH_HEAD git -C arrow submodule update --init --recursive - - name: Remove and Reinstall vcpkg - # TODO: this comment seems outdated, the vcpkg on GHA is quite recent. - # As of January 2021, the version of vcpkg that is preinstalled on the - # Github Actions windows-2019 image is 2020.11.12, as noted at - # https://github.com/actions/virtual-environments/blob/main/images/win/Windows2019-Readme.md - # This version of vcpkg has a bug that causes the installation of - # aws-cpp-sdk to fail. See details at - # https://github.com/awslabs/aws-c-common/issues/734 - # and https://github.com/microsoft/vcpkg/pull/14716. - # When running vcpkg in Github Actions on Windows, remove the - # preinstalled vcpkg and install the newest version from source. - shell: cmd - run: | - CALL vcpkg integrate remove 2>NUL - CALL C: - CALL cd \ - CALL rmdir /s /q vcpkg 2>NUL - CALL git clone https://github.com/microsoft/vcpkg.git vcpkg - CALL cd vcpkg - CALL bootstrap-vcpkg.bat -win64 -disableMetrics - CALL vcpkg integrate install - CALL setx PATH "%PATH%;C:\vcpkg" - - name: vcpkg cache info - id: vcpkg-cache-info - shell: bash - run: | - echo "::set-output name=cache-dir::$LOCALAPPDATA/vcpkg/archives" - - name: Cache vcpkg binaries - uses: actions/cache@v2 - with: - {% raw %} - path: ${{ steps.vcpkg-cache-info.outputs.cache-dir }} - key: ${{ github.job }} - {% endraw %} - name: Install Dependencies with vcpkg and Build Arrow C++ shell: cmd run: | From a581c295525b564293a7e8b50aad003fd2aacbd5 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 28 Jun 2021 17:38:35 +0200 Subject: [PATCH 12/13] Restore vcpkg reinstall --- dev/tasks/vcpkg-tests/github.windows.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/dev/tasks/vcpkg-tests/github.windows.yml b/dev/tasks/vcpkg-tests/github.windows.yml index 240f4c94e01..8546d4307d1 100644 --- a/dev/tasks/vcpkg-tests/github.windows.yml +++ b/dev/tasks/vcpkg-tests/github.windows.yml @@ -35,6 +35,28 @@ jobs: git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} git -C arrow checkout FETCH_HEAD git -C arrow submodule update --init --recursive + - name: Remove and Reinstall vcpkg + # TODO: this comment seems outdated, the vcpkg on GHA is quite recent. + # As of January 2021, the version of vcpkg that is preinstalled on the + # Github Actions windows-2019 image is 2020.11.12, as noted at + # https://github.com/actions/virtual-environments/blob/main/images/win/Windows2019-Readme.md + # This version of vcpkg has a bug that causes the installation of + # aws-cpp-sdk to fail. See details at + # https://github.com/awslabs/aws-c-common/issues/734 + # and https://github.com/microsoft/vcpkg/pull/14716. + # When running vcpkg in Github Actions on Windows, remove the + # preinstalled vcpkg and install the newest version from source. + shell: cmd + run: | + CALL vcpkg integrate remove 2>NUL + CALL C: + CALL cd \ + CALL rmdir /s /q vcpkg 2>NUL + CALL git clone https://github.com/microsoft/vcpkg.git vcpkg + CALL cd vcpkg + CALL bootstrap-vcpkg.bat -win64 -disableMetrics + CALL vcpkg integrate install + CALL setx PATH "%PATH%;C:\vcpkg" - name: Install Dependencies with vcpkg and Build Arrow C++ shell: cmd run: | From 0f6c823418a3b41ad5063dfbf8fa4ff1992e1047 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 29 Jun 2021 15:05:12 +0200 Subject: [PATCH 13/13] Address review comments --- cpp/src/parquet/statistics.cc | 7 +++---- dev/tasks/vcpkg-tests/github.windows.yml | 11 +++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 7d22124c165..72341590e75 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -91,7 +91,7 @@ struct UnsignedCompareHelperBase { static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size"); // NOTE: according to the C++ spec, unsigned-to-signed conversion is - // implementation-defined if the original value does not fix in the signed type + // implementation-defined if the original value does not fit in the signed type // (i.e., two's complement cannot be assumed even on mainstream machines, // because the compiler may decide otherwise). Hence the use of `SafeCopy` // below for deterministic bit-casting. @@ -99,9 +99,8 @@ struct UnsignedCompareHelperBase { // https://en.cppreference.com/w/cpp/language/implicit_conversion) static const T DefaultMin() { return SafeCopy(std::numeric_limits::max()); } - static const T DefaultMax() { - return SafeCopy(std::numeric_limits::lowest()); - } + static const T DefaultMax() { return 0; } + static T Coalesce(T val, T fallback) { return val; } static bool Compare(int type_length, T a, T b) { diff --git a/dev/tasks/vcpkg-tests/github.windows.yml b/dev/tasks/vcpkg-tests/github.windows.yml index 8546d4307d1..ad3e793a6c3 100644 --- a/dev/tasks/vcpkg-tests/github.windows.yml +++ b/dev/tasks/vcpkg-tests/github.windows.yml @@ -36,16 +36,11 @@ jobs: git -C arrow checkout FETCH_HEAD git -C arrow submodule update --init --recursive - name: Remove and Reinstall vcpkg - # TODO: this comment seems outdated, the vcpkg on GHA is quite recent. - # As of January 2021, the version of vcpkg that is preinstalled on the - # Github Actions windows-2019 image is 2020.11.12, as noted at - # https://github.com/actions/virtual-environments/blob/main/images/win/Windows2019-Readme.md - # This version of vcpkg has a bug that causes the installation of - # aws-cpp-sdk to fail. See details at - # https://github.com/awslabs/aws-c-common/issues/734 - # and https://github.com/microsoft/vcpkg/pull/14716. # When running vcpkg in Github Actions on Windows, remove the # preinstalled vcpkg and install the newest version from source. + # Versions of vcpkg rapidly stop working until updated, and + # the safest and most reliable way to update vcpkg is simply + # to remove and reinstall it. shell: cmd run: | CALL vcpkg integrate remove 2>NUL