From f5c8e796d6e699a7f0f8a22db9bcb0dc55952ea3 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 10 Feb 2023 11:54:22 +0800 Subject: [PATCH 1/2] GH-14870: [C++][Parquet] Fix parsing stats from min_value/max_value --- .../parquet/arrow/arrow_reader_writer_test.cc | 24 +++++++++++++++++-- cpp/src/parquet/column_reader.cc | 11 ++++++--- cpp/src/parquet/file_deserialize_test.cc | 6 ++++- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ef247786a38..1778a154c6d 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4083,6 +4083,16 @@ TEST_P(TestArrowWriteDictionary, Statistics) { std::vector> expected_min_max_ = { {"a", "b"}, {"b", "c"}, {"a", "d"}, {"", ""}}; + const std::vector>> expected_min_by_page = { + {{"b", "a"}, {"b", "a"}}, {{"b", "b"}, {"b", "b"}}, {{"c", "a"}, {"c", "a"}}}; + const std::vector>> expected_max_by_page = { + {{"b", "a"}, {"b", "a"}}, {{"c", "c"}, {"c", "c"}}, {{"d", "a"}, {"d", "a"}}}; + const std::vector>> expected_has_min_max_by_page = { + {{true, true}, {true, true}}, + {{true, true}, {true, true}}, + {{true, true}, {true, true}}, + {{false}, {false}}}; + for (std::size_t case_index = 0; case_index < test_dictionaries.size(); case_index++) { SCOPED_TRACE(test_dictionaries[case_index]->type()->ToString()); ASSERT_OK_AND_ASSIGN(std::shared_ptr<::arrow::Array> dict_encoded, @@ -4143,8 +4153,18 @@ TEST_P(TestArrowWriteDictionary, Statistics) { DataPage* data_page = (DataPage*)page.get(); const EncodedStatistics& stats = data_page->statistics(); EXPECT_EQ(stats.null_count, expected_null_by_page[case_index][page_index]); - EXPECT_EQ(stats.has_min, false); - EXPECT_EQ(stats.has_max, false); + + auto expect_has_min_max = + expected_has_min_max_by_page[case_index][row_group_index][page_index]; + EXPECT_EQ(stats.has_min, expect_has_min_max); + EXPECT_EQ(stats.has_max, expect_has_min_max); + if (expect_has_min_max) { + EXPECT_EQ(stats.min(), + expected_min_by_page[case_index][row_group_index][page_index]); + EXPECT_EQ(stats.max(), + expected_max_by_page[case_index][row_group_index][page_index]); + } + EXPECT_EQ(data_page->num_values(), expected_valid_by_page[case_index][page_index] + expected_null_by_page[case_index][page_index]); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3670af49fbf..ed9377b0112 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -211,10 +211,15 @@ EncodedStatistics ExtractStatsFromHeader(const H& header) { return page_statistics; } const format::Statistics& stats = header.statistics; - if (stats.__isset.max) { + // Use the new V2 min-max statistics over the former one if it is filled + if (stats.__isset.max_value && stats.__isset.min_value) { + // TODO: check if the column_order is TYPE_DEFINED_ORDER. + page_statistics.set_max(stats.max_value); + page_statistics.set_min(stats.min_value); + } else if (stats.__isset.max && stats.__isset.min) { + // TODO: check created_by to see if it is corrupted for some types. + // TODO: check if the sort_order is SIGNED. page_statistics.set_max(stats.max); - } - if (stats.__isset.min) { page_statistics.set_min(stats.min); } if (stats.__isset.null_count) { diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index 76f34a1eecb..114c4f46ee7 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -514,7 +514,11 @@ TEST_F(TestPageSerde, DataPageV2) { TEST_F(TestPageSerde, TestLargePageHeaders) { int stats_size = 256 * 1024; // 256 KB - AddDummyStats(stats_size, data_page_header_); + AddDummyStats(stats_size, data_page_header_, /*fill_all_stats=*/false); + + // AddDummyStats() above has only set max which results in an invalid statistics. + // Set min explicitly here to make it valid. + data_page_header_.statistics.__set_min(""); // Any number to verify metadata roundtrip const int32_t num_rows = 4141; From ccdcbbb5a49dbcc460e410f3c25a50e593f8004f Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 17 Feb 2023 10:16:25 +0800 Subject: [PATCH 2/2] support parsing only one side of min/max --- cpp/src/parquet/column_reader.cc | 20 ++++++++++++++------ cpp/src/parquet/file_deserialize_test.cc | 6 +----- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index ed9377b0112..bd9d3483d3f 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -212,15 +212,23 @@ EncodedStatistics ExtractStatsFromHeader(const H& header) { } const format::Statistics& stats = header.statistics; // Use the new V2 min-max statistics over the former one if it is filled - if (stats.__isset.max_value && stats.__isset.min_value) { + if (stats.__isset.max_value || stats.__isset.min_value) { // TODO: check if the column_order is TYPE_DEFINED_ORDER. - page_statistics.set_max(stats.max_value); - page_statistics.set_min(stats.min_value); - } else if (stats.__isset.max && stats.__isset.min) { + if (stats.__isset.max_value) { + page_statistics.set_max(stats.max_value); + } + if (stats.__isset.min_value) { + page_statistics.set_min(stats.min_value); + } + } else if (stats.__isset.max || stats.__isset.min) { // TODO: check created_by to see if it is corrupted for some types. // TODO: check if the sort_order is SIGNED. - page_statistics.set_max(stats.max); - page_statistics.set_min(stats.min); + if (stats.__isset.max) { + page_statistics.set_max(stats.max); + } + if (stats.__isset.min) { + page_statistics.set_min(stats.min); + } } if (stats.__isset.null_count) { page_statistics.set_null_count(stats.null_count); diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index 114c4f46ee7..76f34a1eecb 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -514,11 +514,7 @@ TEST_F(TestPageSerde, DataPageV2) { TEST_F(TestPageSerde, TestLargePageHeaders) { int stats_size = 256 * 1024; // 256 KB - AddDummyStats(stats_size, data_page_header_, /*fill_all_stats=*/false); - - // AddDummyStats() above has only set max which results in an invalid statistics. - // Set min explicitly here to make it valid. - data_page_header_.statistics.__set_min(""); + AddDummyStats(stats_size, data_page_header_); // Any number to verify metadata roundtrip const int32_t num_rows = 4141;