diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ef247786a38..1778a154c6d 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4083,6 +4083,16 @@ TEST_P(TestArrowWriteDictionary, Statistics) { std::vector> expected_min_max_ = { {"a", "b"}, {"b", "c"}, {"a", "d"}, {"", ""}}; + const std::vector>> expected_min_by_page = { + {{"b", "a"}, {"b", "a"}}, {{"b", "b"}, {"b", "b"}}, {{"c", "a"}, {"c", "a"}}}; + const std::vector>> expected_max_by_page = { + {{"b", "a"}, {"b", "a"}}, {{"c", "c"}, {"c", "c"}}, {{"d", "a"}, {"d", "a"}}}; + const std::vector>> expected_has_min_max_by_page = { + {{true, true}, {true, true}}, + {{true, true}, {true, true}}, + {{true, true}, {true, true}}, + {{false}, {false}}}; + for (std::size_t case_index = 0; case_index < test_dictionaries.size(); case_index++) { SCOPED_TRACE(test_dictionaries[case_index]->type()->ToString()); ASSERT_OK_AND_ASSIGN(std::shared_ptr<::arrow::Array> dict_encoded, @@ -4143,8 +4153,18 @@ TEST_P(TestArrowWriteDictionary, Statistics) { DataPage* data_page = (DataPage*)page.get(); const EncodedStatistics& stats = data_page->statistics(); EXPECT_EQ(stats.null_count, expected_null_by_page[case_index][page_index]); - EXPECT_EQ(stats.has_min, false); - EXPECT_EQ(stats.has_max, false); + + auto expect_has_min_max = + expected_has_min_max_by_page[case_index][row_group_index][page_index]; + EXPECT_EQ(stats.has_min, expect_has_min_max); + EXPECT_EQ(stats.has_max, expect_has_min_max); + if (expect_has_min_max) { + EXPECT_EQ(stats.min(), + expected_min_by_page[case_index][row_group_index][page_index]); + EXPECT_EQ(stats.max(), + expected_max_by_page[case_index][row_group_index][page_index]); + } + EXPECT_EQ(data_page->num_values(), expected_valid_by_page[case_index][page_index] + expected_null_by_page[case_index][page_index]); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3670af49fbf..bd9d3483d3f 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -211,11 +211,24 @@ EncodedStatistics ExtractStatsFromHeader(const H& header) { return page_statistics; } const format::Statistics& stats = header.statistics; - if (stats.__isset.max) { - page_statistics.set_max(stats.max); - } - if (stats.__isset.min) { - page_statistics.set_min(stats.min); + // Use the new V2 min-max statistics over the former one if it is filled + if (stats.__isset.max_value || stats.__isset.min_value) { + // TODO: check if the column_order is TYPE_DEFINED_ORDER. + if (stats.__isset.max_value) { + page_statistics.set_max(stats.max_value); + } + if (stats.__isset.min_value) { + page_statistics.set_min(stats.min_value); + } + } else if (stats.__isset.max || stats.__isset.min) { + // TODO: check created_by to see if it is corrupted for some types. + // TODO: check if the sort_order is SIGNED. + if (stats.__isset.max) { + page_statistics.set_max(stats.max); + } + if (stats.__isset.min) { + page_statistics.set_min(stats.min); + } } if (stats.__isset.null_count) { page_statistics.set_null_count(stats.null_count);