Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 45 additions & 8 deletions cpp/src/parquet/statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
using arrow::default_memory_pool;
using arrow::MemoryPool;
using arrow::internal::checked_cast;
using arrow::util::SafeCopy;

namespace parquet {
namespace {
Expand All @@ -55,6 +56,9 @@ template <typename DType, bool is_signed>
struct CompareHelper {
using T = typename DType::c_type;

static_assert(!std::is_unsigned<T>::value || std::is_same<T, bool>::value,
"T is an unsigned numeric");

constexpr static T DefaultMin() { return std::numeric_limits<T>::max(); }
constexpr static T DefaultMax() { return std::numeric_limits<T>::lowest(); }

Expand Down Expand Up @@ -83,12 +87,24 @@ struct UnsignedCompareHelperBase {
using T = typename DType::c_type;
using UCType = typename std::make_unsigned<T>::type;

constexpr static T DefaultMin() { return std::numeric_limits<UCType>::max(); }
constexpr static T DefaultMax() { return std::numeric_limits<UCType>::lowest(); }
static_assert(!std::is_same<T, UCType>::value, "T is unsigned");
static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size");

// NOTE: according to the C++ spec, unsigned-to-signed conversion is
// implementation-defined if the original value does not fit in the signed type
// (i.e., two's complement cannot be assumed even on mainstream machines,
// because the compiler may decide otherwise). Hence the use of `SafeCopy`
// below for deterministic bit-casting.
// (see "Integer conversions" in
// https://en.cppreference.com/w/cpp/language/implicit_conversion)

static const T DefaultMin() { return SafeCopy<T>(std::numeric_limits<UCType>::max()); }
static const T DefaultMax() { return 0; }

static T Coalesce(T val, T fallback) { return val; }

static inline bool Compare(int type_length, T a, T b) {
return ::arrow::util::SafeCopy<UCType>(a) < ::arrow::util::SafeCopy<UCType>(b);
static bool Compare(int type_length, T a, T b) {
return SafeCopy<UCType>(a) < SafeCopy<UCType>(b);
}

static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; }
Expand All @@ -107,12 +123,12 @@ struct CompareHelper<Int96Type, is_signed> {
using msb_type = typename std::conditional<is_signed, int32_t, uint32_t>::type;

static T DefaultMin() {
uint32_t kMsbMax = std::numeric_limits<msb_type>::max();
uint32_t kMsbMax = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::max());
uint32_t kMax = std::numeric_limits<uint32_t>::max();
return {kMax, kMax, kMsbMax};
}
static T DefaultMax() {
uint32_t kMsbMin = std::numeric_limits<msb_type>::min();
uint32_t kMsbMin = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::min());
uint32_t kMin = std::numeric_limits<uint32_t>::min();
return {kMin, kMin, kMsbMin};
}
Expand All @@ -122,8 +138,7 @@ struct CompareHelper<Int96Type, is_signed> {
if (a.value[2] != b.value[2]) {
// Only the MSB bit is by Signed comparison. For little-endian, this is the
// last bit of Int96 type.
return ::arrow::util::SafeCopy<msb_type>(a.value[2]) <
::arrow::util::SafeCopy<msb_type>(b.value[2]);
return SafeCopy<msb_type>(a.value[2]) < SafeCopy<msb_type>(b.value[2]);
} else if (a.value[1] != b.value[1]) {
return (a.value[1] < b.value[1]);
}
Expand Down Expand Up @@ -374,6 +389,28 @@ class TypedComparatorImpl : virtual public TypedComparator<DType> {
int type_length_;
};

// ARROW-11675: A hand-written version of GetMinMax(), to work around
// what looks like a MSVC code generation bug.
// This does not seem to be required for GetMinMaxSpaced().
template <>
std::pair<int32_t, int32_t>
TypedComparatorImpl</*is_signed=*/false, Int32Type>::GetMinMax(const int32_t* values,
int64_t length) {
DCHECK_GT(length, 0);

const uint32_t* unsigned_values = reinterpret_cast<const uint32_t*>(values);
uint32_t min = std::numeric_limits<uint32_t>::max();
uint32_t max = std::numeric_limits<uint32_t>::lowest();

for (int64_t i = 0; i < length; i++) {
const auto val = unsigned_values[i];
min = std::min<uint32_t>(min, val);
max = std::max<uint32_t>(max, val);
}

return {SafeCopy<int32_t>(min), SafeCopy<int32_t>(max)};
}

template <bool is_signed, typename DType>
std::pair<typename DType::c_type, typename DType::c_type>
TypedComparatorImpl<is_signed, DType>::GetMinMax(const ::arrow::Array& values) {
Expand Down
73 changes: 54 additions & 19 deletions cpp/src/parquet/statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/ubsan.h"

#include "parquet/column_reader.h"
#include "parquet/column_writer.h"
Expand All @@ -44,6 +45,7 @@

using arrow::default_memory_pool;
using arrow::MemoryPool;
using arrow::util::SafeCopy;

namespace BitUtil = arrow::BitUtil;

Expand Down Expand Up @@ -702,10 +704,11 @@ class TestStatisticsSortOrder : public ::testing::Test {
std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
std::shared_ptr<parquet::RowGroupMetaData> rg_metadata = file_metadata->RowGroup(0);
for (int i = 0; i < static_cast<int>(fields_.size()); i++) {
ARROW_SCOPED_TRACE("Statistics for field #", i);
std::shared_ptr<parquet::ColumnChunkMetaData> cc_metadata =
rg_metadata->ColumnChunk(i);
ASSERT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin());
ASSERT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax());
EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin());
EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax());
}
}

Expand Down Expand Up @@ -934,20 +937,20 @@ template <typename Stats, typename Array, typename T = typename Array::value_typ
void AssertMinMaxAre(Stats stats, const Array& values, T expected_min, T expected_max) {
stats->Update(values.data(), values.size(), 0);
ASSERT_TRUE(stats->HasMinMax());
ASSERT_EQ(stats->min(), expected_min);
ASSERT_EQ(stats->max(), expected_max);
EXPECT_EQ(stats->min(), expected_min);
EXPECT_EQ(stats->max(), expected_max);
}

template <typename Stats, typename Array, typename T = typename Array::value_type>
template <typename Stats, typename Array, typename T = typename Stats::T>
void AssertMinMaxAre(Stats stats, const Array& values, const uint8_t* valid_bitmap,
T expected_min, T expected_max) {
auto n_values = values.size();
auto null_count = ::arrow::internal::CountSetBits(valid_bitmap, n_values, 0);
auto non_null_count = n_values - null_count;
stats->UpdateSpaced(values.data(), valid_bitmap, 0, non_null_count, null_count);
ASSERT_TRUE(stats->HasMinMax());
ASSERT_EQ(stats->min(), expected_min);
ASSERT_EQ(stats->max(), expected_max);
EXPECT_EQ(stats->min(), expected_min);
EXPECT_EQ(stats->max(), expected_max);
}

template <typename Stats, typename Array>
Expand All @@ -966,17 +969,17 @@ void AssertUnsetMinMax(Stats stats, const Array& values, const uint8_t* valid_bi
}

template <typename ParquetType, typename T = typename ParquetType::c_type>
void CheckExtremums() {
void CheckExtrema() {
using UT = typename std::make_unsigned<T>::type;

T smin = std::numeric_limits<T>::min();
T smax = std::numeric_limits<T>::max();
T umin = std::numeric_limits<UT>::min();
T umax = std::numeric_limits<UT>::max();
const T smin = std::numeric_limits<T>::min();
const T smax = std::numeric_limits<T>::max();
const T umin = SafeCopy<T>(std::numeric_limits<UT>::min());
const T umax = SafeCopy<T>(std::numeric_limits<UT>::max());

constexpr int kNumValues = 8;
std::array<T, kNumValues> values{0, smin, smax, umin,
umax, smin + 1, smax - 1, umin - 1};
umax, smin + 1, smax - 1, umax - 1};

NodePtr unsigned_node = PrimitiveNode::Make(
"uint", Repetition::OPTIONAL,
Expand All @@ -987,15 +990,47 @@ void CheckExtremums() {
LogicalType::Int(sizeof(T) * CHAR_BIT, true /*signed*/), ParquetType::type_num);
ColumnDescriptor signed_descr(signed_node, 1, 1);

auto unsigned_stats = MakeStatistics<ParquetType>(&unsigned_descr);
AssertMinMaxAre(unsigned_stats, values, umin, umax);
{
ARROW_SCOPED_TRACE("unsigned statistics: umin = ", umin, ", umax = ", umax,
", node type = ", unsigned_node->logical_type()->ToString(),
", physical type = ", unsigned_descr.physical_type(),
", sort order = ", unsigned_descr.sort_order());
auto unsigned_stats = MakeStatistics<ParquetType>(&unsigned_descr);
AssertMinMaxAre(unsigned_stats, values, umin, umax);
}
{
ARROW_SCOPED_TRACE("signed statistics: smin = ", smin, ", smax = ", smax,
", node type = ", signed_node->logical_type()->ToString(),
", physical type = ", signed_descr.physical_type(),
", sort order = ", signed_descr.sort_order());
auto signed_stats = MakeStatistics<ParquetType>(&signed_descr);
AssertMinMaxAre(signed_stats, values, smin, smax);
}

auto signed_stats = MakeStatistics<ParquetType>(&signed_descr);
AssertMinMaxAre(signed_stats, values, smin, smax);
// With validity bitmap
std::vector<bool> is_valid = {true, false, false, false, false, true, true, true};
std::shared_ptr<Buffer> valid_bitmap;
::arrow::BitmapFromVector(is_valid, &valid_bitmap);
{
ARROW_SCOPED_TRACE("spaced unsigned statistics: umin = ", umin, ", umax = ", umax,
", node type = ", unsigned_node->logical_type()->ToString(),
", physical type = ", unsigned_descr.physical_type(),
", sort order = ", unsigned_descr.sort_order());
auto unsigned_stats = MakeStatistics<ParquetType>(&unsigned_descr);
AssertMinMaxAre(unsigned_stats, values, valid_bitmap->data(), T{0}, umax - 1);
}
{
ARROW_SCOPED_TRACE("spaced signed statistics: smin = ", smin, ", smax = ", smax,
", node type = ", signed_node->logical_type()->ToString(),
", physical type = ", signed_descr.physical_type(),
", sort order = ", signed_descr.sort_order());
auto signed_stats = MakeStatistics<ParquetType>(&signed_descr);
AssertMinMaxAre(signed_stats, values, valid_bitmap->data(), smin + 1, smax - 1);
}
}

TEST(TestStatistic, Int32Extremums) { CheckExtremums<Int32Type>(); }
TEST(TestStatistic, Int64Extremums) { CheckExtremums<Int64Type>(); }
TEST(TestStatistic, Int32Extrema) { CheckExtrema<Int32Type>(); }
TEST(TestStatistic, Int64Extrema) { CheckExtrema<Int64Type>(); }

// PARQUET-1225: Float NaN values may lead to incorrect min-max
template <typename ParquetType>
Expand Down
4 changes: 3 additions & 1 deletion cpp/vcpkg.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
]
},
"benchmark",
"boost",
"boost-filesystem",
"boost-multiprecision",
"boost-system",
"brotli",
"bzip2",
"c-ares",
Expand Down
13 changes: 4 additions & 9 deletions dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ cmake -G "Visual Studio 16 2019" -A x64 ^
-DARROW_CXXFLAGS="/MP" ^
-DARROW_DATASET=ON ^
-DARROW_DEPENDENCY_SOURCE=VCPKG ^
-DARROW_FLIGHT=ON ^
-DARROW_FLIGHT=OFF ^
-DARROW_MIMALLOC=ON ^
-DARROW_PARQUET=ON ^
-DARROW_PYTHON=OFF ^
Expand All @@ -79,13 +79,8 @@ cmake --build . --target INSTALL --config Release || exit /B 1

@rem Test Arrow C++ library

@rem TODO(ARROW-11675): Uncomment the below
@rem and troubleshoot two test failures:
@rem - TestStatisticsSortOrder/0.MinMax
@rem - TestStatistic.Int32Extremums

@rem ctest --output-on-failure ^
@rem --parallel %NUMBER_OF_PROCESSORS% ^
@rem --timeout 300 || exit /B 1
ctest --output-on-failure ^
--parallel %NUMBER_OF_PROCESSORS% ^
--timeout 300 || exit /B 1

popd
10 changes: 3 additions & 7 deletions dev/tasks/vcpkg-tests/github.windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,11 @@ jobs:
git -C arrow checkout FETCH_HEAD
git -C arrow submodule update --init --recursive
- name: Remove and Reinstall vcpkg
# As of January 2021, the version of vcpkg that is preinstalled on the
# Github Actions windows-2019 image is 2020.11.12, as noted at
# https://github.com/actions/virtual-environments/blob/main/images/win/Windows2019-Readme.md
# This version of vcpkg has a bug that causes the installation of
# aws-cpp-sdk to fail. See details at
# https://github.com/awslabs/aws-c-common/issues/734
# and https://github.com/microsoft/vcpkg/pull/14716.
# When running vcpkg in Github Actions on Windows, remove the
# preinstalled vcpkg and install the newest version from source.
# Versions of vcpkg rapidly stop working until updated, and
# the safest and most reliable way to update vcpkg is simply
# to remove and reinstall it.
shell: cmd
run: |
CALL vcpkg integrate remove 2>NUL
Expand Down