From 7be757d84b0a8452e05b8cbebb7a5e3d662a5309 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 30 Dec 2022 15:13:35 +0530 Subject: [PATCH 01/11] fix: parquet predicate push-down handling with NaNs --- cpp/src/arrow/dataset/file_parquet.cc | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 3c27bd2b00e..7affcad7c2d 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -135,6 +135,10 @@ std::optional ColumnChunkStatisticsAsExpression( min = maybe_min.MoveValueUnsafe(); max = maybe_max.MoveValueUnsafe(); + if(!(min->is_valid) && !(max->is_valid)){ + return std::nullopt; + } + if (min->Equals(max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); @@ -143,18 +147,25 @@ std::optional ColumnChunkStatisticsAsExpression( } return compute::or_(std::move(single_value), is_null(std::move(field_expr))); } - + auto lower_bound = - compute::greater_equal(field_expr, compute::literal(std::move(min))); - auto upper_bound = compute::less_equal(field_expr, compute::literal(std::move(max))); + compute::greater_equal(field_expr, compute::literal(min)); + auto upper_bound = compute::less_equal(field_expr, compute::literal(max)); + auto in_range = compute::Expression(); + + if(!(min->is_valid)){ + in_range = std::move(upper_bound); + } else if(!(max->is_valid)){ + in_range = std::move(lower_bound); + } else { + in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); + } - auto in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); if (statistics->null_count() != 0) { return compute::or_(std::move(in_range), compute::is_null(field_expr)); } return in_range; } - return std::nullopt; } From cf1b41d6a41c7ccd446a3fc54f9db66f0ed9b2fc Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 3 Jan 2023 01:04:47 +0530 Subject: [PATCH 02/11] fix: checking for NaN instead of null values --- cpp/src/arrow/dataset/file_parquet.cc | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 7affcad7c2d..d1d491d9597 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -98,6 +98,17 @@ Result> GetSchemaManifest( return manifest; } +bool isNan(const Scalar& value) { + if (value.type->Equals(*float32())) { + const FloatScalar& float_scalar = checked_cast(value); + return isnan(float_scalar.value); + } else if (value.type->Equals(*float64())) { + const DoubleScalar& double_scalar = checked_cast(value); + return isnan(double_scalar.value); + } + return false; +} + std::optional ColumnChunkStatisticsAsExpression( const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) { // For the remaining of this function, failure to extract/parse statistics @@ -135,10 +146,10 @@ std::optional ColumnChunkStatisticsAsExpression( min = maybe_min.MoveValueUnsafe(); max = maybe_max.MoveValueUnsafe(); - if(!(min->is_valid) && !(max->is_valid)){ + if (isNan(*min) && isNan(*max)) { return std::nullopt; } - + if (min->Equals(max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); @@ -147,15 +158,14 @@ std::optional ColumnChunkStatisticsAsExpression( } return compute::or_(std::move(single_value), is_null(std::move(field_expr))); } - - auto lower_bound = - compute::greater_equal(field_expr, compute::literal(min)); + + auto lower_bound = compute::greater_equal(field_expr, compute::literal(min)); auto upper_bound = compute::less_equal(field_expr, compute::literal(max)); - auto in_range = compute::Expression(); + compute::Expression in_range; - if(!(min->is_valid)){ + if (isNan(*min)) { in_range = std::move(upper_bound); - } else if(!(max->is_valid)){ + } else if (isNan(*max)) { in_range = std::move(lower_bound); } else { in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); From caeba9bafffc48398ff4eb48a6fb1f505e7af8ff Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 3 Jan 2023 10:56:57 +0530 Subject: [PATCH 03/11] fix: using std namespace for nan() function --- cpp/src/arrow/dataset/file_parquet.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index d1d491d9597..aa4a00453be 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -101,10 +101,10 @@ Result> GetSchemaManifest( bool isNan(const Scalar& value) { if (value.type->Equals(*float32())) { const FloatScalar& float_scalar = checked_cast(value); - return isnan(float_scalar.value); + return std::isnan(float_scalar.value); } else if (value.type->Equals(*float64())) { const DoubleScalar& double_scalar = checked_cast(value); - return isnan(double_scalar.value); + return std::isnan(double_scalar.value); } return false; } From e3f2f5edf78adc3b036801720670aed56b4c841c Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 5 Jan 2023 01:42:35 +0530 Subject: [PATCH 04/11] review: coding convention, type id check, checking validity, comment on usage --- cpp/src/arrow/dataset/file_parquet.cc | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index aa4a00453be..bab6ebc7098 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -98,13 +98,15 @@ Result> GetSchemaManifest( return manifest; } -bool isNan(const Scalar& value) { - if (value.type->Equals(*float32())) { - const FloatScalar& float_scalar = checked_cast(value); - return std::isnan(float_scalar.value); - } else if (value.type->Equals(*float64())) { - const DoubleScalar& double_scalar = checked_cast(value); - return std::isnan(double_scalar.value); +bool IsNan(const Scalar& value) { + if (value.is_valid) { + if (value.type->id() == Type::FLOAT) { + const FloatScalar& float_scalar = checked_cast(value); + return std::isnan(float_scalar.value); + } else if (value.type->id() == Type::DOUBLE) { + const DoubleScalar& double_scalar = checked_cast(value); + return std::isnan(double_scalar.value); + } } return false; } @@ -146,7 +148,10 @@ std::optional ColumnChunkStatisticsAsExpression( min = maybe_min.MoveValueUnsafe(); max = maybe_max.MoveValueUnsafe(); - if (isNan(*min) && isNan(*max)) { + // Since the minimum & maximum values are NaN, useful statistics + // cannot be extracted for checking the presence of a value within + // range + if (IsNan(*min) && IsNan(*max)) { return std::nullopt; } @@ -163,9 +168,11 @@ std::optional ColumnChunkStatisticsAsExpression( auto upper_bound = compute::less_equal(field_expr, compute::literal(max)); compute::Expression in_range; - if (isNan(*min)) { + // If either minimum or maximum is NaN, it should be ignored for the + // range computation + if (IsNan(*min)) { in_range = std::move(upper_bound); - } else if (isNan(*max)) { + } else if (IsNan(*max)) { in_range = std::move(lower_bound); } else { in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); From 3b19ece519e414a7f14f75fce0793a24042b8611 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 6 Jan 2023 05:06:01 +0530 Subject: [PATCH 05/11] fix: move if condition for NaN check on min and max --- cpp/src/arrow/dataset/file_parquet.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index bab6ebc7098..9980f2316e0 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -148,13 +148,6 @@ std::optional ColumnChunkStatisticsAsExpression( min = maybe_min.MoveValueUnsafe(); max = maybe_max.MoveValueUnsafe(); - // Since the minimum & maximum values are NaN, useful statistics - // cannot be extracted for checking the presence of a value within - // range - if (IsNan(*min) && IsNan(*max)) { - return std::nullopt; - } - if (min->Equals(max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); @@ -164,6 +157,13 @@ std::optional ColumnChunkStatisticsAsExpression( return compute::or_(std::move(single_value), is_null(std::move(field_expr))); } + // Since the minimum & maximum values are NaN, useful statistics + // cannot be extracted for checking the presence of a value within + // range + if (IsNan(*min) && IsNan(*max)) { + return std::nullopt; + } + auto lower_bound = compute::greater_equal(field_expr, compute::literal(min)); auto upper_bound = compute::less_equal(field_expr, compute::literal(max)); compute::Expression in_range; From 3fa5bb75ad5918c62012b1587282a783556ba205 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 12 Jan 2023 13:25:53 +0530 Subject: [PATCH 06/11] feat: restructure function and add test --- cpp/src/arrow/dataset/file_parquet.cc | 120 +++++++++++---------- cpp/src/arrow/dataset/file_parquet.h | 2 + cpp/src/arrow/dataset/file_parquet_test.cc | 11 ++ 3 files changed, 76 insertions(+), 57 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 9980f2316e0..ba2de8b77c5 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -125,65 +125,9 @@ std::optional ColumnChunkStatisticsAsExpression( auto column_metadata = metadata.ColumnChunk(schema_field.column_index); auto statistics = column_metadata->statistics(); - if (statistics == nullptr) { - return std::nullopt; - } - const auto& field = schema_field.field; - auto field_expr = compute::field_ref(field->name()); - - // Optimize for corner case where all values are nulls - if (statistics->num_values() == 0 && statistics->null_count() > 0) { - return is_null(std::move(field_expr)); - } - - std::shared_ptr min, max; - if (!StatisticsAsScalars(*statistics, &min, &max).ok()) { - return std::nullopt; - } - - auto maybe_min = min->CastTo(field->type()); - auto maybe_max = max->CastTo(field->type()); - if (maybe_min.ok() && maybe_max.ok()) { - min = maybe_min.MoveValueUnsafe(); - max = maybe_max.MoveValueUnsafe(); - - if (min->Equals(max)) { - auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); - - if (statistics->null_count() == 0) { - return single_value; - } - return compute::or_(std::move(single_value), is_null(std::move(field_expr))); - } + return ParquetFileFragment::EvaluateStatisticsAsExpression(field, statistics); - // Since the minimum & maximum values are NaN, useful statistics - // cannot be extracted for checking the presence of a value within - // range - if (IsNan(*min) && IsNan(*max)) { - return std::nullopt; - } - - auto lower_bound = compute::greater_equal(field_expr, compute::literal(min)); - auto upper_bound = compute::less_equal(field_expr, compute::literal(max)); - compute::Expression in_range; - - // If either minimum or maximum is NaN, it should be ignored for the - // range computation - if (IsNan(*min)) { - in_range = std::move(upper_bound); - } else if (IsNan(*max)) { - in_range = std::move(lower_bound); - } else { - in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); - } - - if (statistics->null_count() != 0) { - return compute::or_(std::move(in_range), compute::is_null(field_expr)); - } - return in_range; - } - return std::nullopt; } void AddColumnIndices(const SchemaField& schema_field, @@ -334,6 +278,68 @@ Result IsSupportedParquetFile(const ParquetFileFormat& format, } // namespace +std::optional ParquetFileFragment::EvaluateStatisticsAsExpression(std::shared_ptr field,std::shared_ptr statistics){ + if (statistics == nullptr) { + return std::nullopt; + } + + auto field_expr = compute::field_ref(field->name()); + + // Optimize for corner case where all values are nulls + if (statistics->num_values() == 0 && statistics->null_count() > 0) { + return is_null(std::move(field_expr)); + } + + std::shared_ptr min, max; + if (!StatisticsAsScalars(*statistics, &min, &max).ok()) { + return std::nullopt; + } + + auto maybe_min = min->CastTo(field->type()); + auto maybe_max = max->CastTo(field->type()); + + if (maybe_min.ok() && maybe_max.ok()) { + min = maybe_min.MoveValueUnsafe(); + max = maybe_max.MoveValueUnsafe(); + + if (min->Equals(max)) { + auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); + + if (statistics->null_count() == 0) { + return single_value; + } + return compute::or_(std::move(single_value), is_null(std::move(field_expr))); + } + + // Since the minimum & maximum values are NaN, useful statistics + // cannot be extracted for checking the presence of a value within + // range + if (IsNan(*min) && IsNan(*max)) { + return std::nullopt; + } + + auto lower_bound = compute::greater_equal(field_expr, compute::literal(min)); + auto upper_bound = compute::less_equal(field_expr, compute::literal(max)); + compute::Expression in_range; + + // If either minimum or maximum is NaN, it should be ignored for the + // range computation + if (IsNan(*min)) { + in_range = std::move(upper_bound); + } else if (IsNan(*max)) { + in_range = std::move(lower_bound); + } else { + in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); + } + + if (statistics->null_count() != 0) { + return compute::or_(std::move(in_range), compute::is_null(field_expr)); + } + return in_range; + } + return std::nullopt; +} + ParquetFileFormat::ParquetFileFormat() : FileFormat(std::make_shared()) {} diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index 1087fb9f9de..a14e2442b3b 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -163,6 +163,8 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { Result> Subset(compute::Expression predicate); Result> Subset(std::vector row_group_ids); + static std::optional EvaluateStatisticsAsExpression(std::shared_ptr field,std::shared_ptr statistics); + private: ParquetFileFragment(FileSource source, std::shared_ptr format, compute::Expression partition_expression, diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 1d2febf0fe8..25b1722aefb 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -36,6 +36,9 @@ #include "parquet/arrow/writer.h" #include "parquet/metadata.h" +#include "parquet/statistics.h" +#include "parquet/types.h" + namespace arrow { @@ -632,5 +635,13 @@ INSTANTIATE_TEST_SUITE_P(TestScan, TestParquetFileFormatScan, ::testing::ValuesIn(TestFormatParams::Values()), TestFormatParams::ToTestNameString); +TEST(TestParquetStatistics, NullMinAndMax){ + auto field = ::arrow::field("a", float32()); + auto node = parquet::schema::PrimitiveNode::Make("float", parquet::Repetition::OPTIONAL, parquet::Type::FLOAT); + parquet::ColumnDescriptor column_desc(node, 4, 1); + auto statistics = parquet::Statistics::Make(&column_desc, std::string("NaN"), std::string("NaN"), 5, 2, 1, true, true, true); + auto stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(field, statistics); +} + } // namespace dataset } // namespace arrow From 972bfd7b48d59afd5ebaf2e81fb966e58b9969b2 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 31 Jan 2023 04:46:46 +0530 Subject: [PATCH 07/11] fix: using parquet file for testing --- cpp/src/arrow/dataset/file_parquet.cc | 18 +++++++++--------- cpp/src/arrow/dataset/file_parquet.h | 3 ++- cpp/src/arrow/dataset/file_parquet_test.cc | 19 ++++++++++++------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index ba2de8b77c5..9338e04fc9b 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -127,7 +127,6 @@ std::optional ColumnChunkStatisticsAsExpression( auto statistics = column_metadata->statistics(); const auto& field = schema_field.field; return ParquetFileFragment::EvaluateStatisticsAsExpression(field, statistics); - } void AddColumnIndices(const SchemaField& schema_field, @@ -278,7 +277,8 @@ Result IsSupportedParquetFile(const ParquetFileFormat& format, } // namespace -std::optional ParquetFileFragment::EvaluateStatisticsAsExpression(std::shared_ptr field,std::shared_ptr statistics){ +std::optional ParquetFileFragment::EvaluateStatisticsAsExpression( + std::shared_ptr field, std::shared_ptr statistics) { if (statistics == nullptr) { return std::nullopt; } @@ -302,6 +302,13 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr min = maybe_min.MoveValueUnsafe(); max = maybe_max.MoveValueUnsafe(); + // Since the minimum & maximum values are NaN, useful statistics + // cannot be extracted for checking the presence of a value within + // range + if (IsNan(*min) && IsNan(*max)) { + return std::nullopt; + } + if (min->Equals(max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); @@ -311,13 +318,6 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr return compute::or_(std::move(single_value), is_null(std::move(field_expr))); } - // Since the minimum & maximum values are NaN, useful statistics - // cannot be extracted for checking the presence of a value within - // range - if (IsNan(*min) && IsNan(*max)) { - return std::nullopt; - } - auto lower_bound = compute::greater_equal(field_expr, compute::literal(min)); auto upper_bound = compute::less_equal(field_expr, compute::literal(max)); compute::Expression in_range; diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index a14e2442b3b..d8d0be234fe 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -163,7 +163,8 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { Result> Subset(compute::Expression predicate); Result> Subset(std::vector row_group_ids); - static std::optional EvaluateStatisticsAsExpression(std::shared_ptr field,std::shared_ptr statistics); + static std::optional EvaluateStatisticsAsExpression( + std::shared_ptr field, std::shared_ptr statistics); private: ParquetFileFragment(FileSource source, std::shared_ptr format, diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 25b1722aefb..7c4b58d6e42 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -32,14 +32,15 @@ #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/util/io_util.h" #include "arrow/util/range.h" #include "parquet/arrow/writer.h" +#include "parquet/file_reader.h" #include "parquet/metadata.h" #include "parquet/statistics.h" #include "parquet/types.h" - namespace arrow { using internal::checked_pointer_cast; @@ -635,12 +636,16 @@ INSTANTIATE_TEST_SUITE_P(TestScan, TestParquetFileFormatScan, ::testing::ValuesIn(TestFormatParams::Values()), TestFormatParams::ToTestNameString); -TEST(TestParquetStatistics, NullMinAndMax){ - auto field = ::arrow::field("a", float32()); - auto node = parquet::schema::PrimitiveNode::Make("float", parquet::Repetition::OPTIONAL, parquet::Type::FLOAT); - parquet::ColumnDescriptor column_desc(node, 4, 1); - auto statistics = parquet::Statistics::Make(&column_desc, std::string("NaN"), std::string("NaN"), 5, 2, 1, true, true, true); - auto stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(field, statistics); +TEST(TestParquetStatistics, NullMax) { + auto field = ::arrow::field("x", float32()); + ASSERT_OK_AND_ASSIGN(std::string dir_string, + arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); + auto reader = + parquet::ParquetFileReader::OpenFile(dir_string + "/nan_in_stats.parquet"); + auto statistics = reader->RowGroup(0)->metadata()->ColumnChunk(0)->statistics(); + auto stat_expression = + ParquetFileFragment::EvaluateStatisticsAsExpression(field, statistics); + EXPECT_EQ(stat_expression->ToString(), "(x >= 1)"); } } // namespace dataset From 8a0b47e50a438d06f608f00ada137cc04c4c6ef2 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Sat, 4 Feb 2023 11:44:43 +0530 Subject: [PATCH 08/11] feat: update submodule for parquet-testing file --- cpp/submodules/parquet-testing | 2 +- testing | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index e2d244ab9a8..33b4e23376c 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit e2d244ab9a84d382e3a50f55db41f362e450428b +Subproject commit 33b4e23376c28e489c6a08b9207829b29e4bffb8 diff --git a/testing b/testing index ecab1162cbe..d2c73bf7824 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit ecab1162cbec872e17d949ecc86181670aee045c +Subproject commit d2c73bf78246331d8e58b6f11aa8aa199cbb5929 From c2748077a4ccf751d3a33a4c658290d5d405ed79 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 8 Feb 2023 01:45:45 +0530 Subject: [PATCH 09/11] feat: using const ref arguments with EvaluateStatisticsAsExpression --- cpp/src/arrow/dataset/file_parquet.cc | 25 +++++++++++----------- cpp/src/arrow/dataset/file_parquet.h | 2 +- cpp/src/arrow/dataset/file_parquet_test.cc | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 9338e04fc9b..709a35a795c 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -126,7 +126,12 @@ std::optional ColumnChunkStatisticsAsExpression( auto column_metadata = metadata.ColumnChunk(schema_field.column_index); auto statistics = column_metadata->statistics(); const auto& field = schema_field.field; - return ParquetFileFragment::EvaluateStatisticsAsExpression(field, statistics); + + if (statistics == nullptr) { + return std::nullopt; + } + + return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics); } void AddColumnIndices(const SchemaField& schema_field, @@ -278,15 +283,11 @@ Result IsSupportedParquetFile(const ParquetFileFormat& format, } // namespace std::optional ParquetFileFragment::EvaluateStatisticsAsExpression( - std::shared_ptr field, std::shared_ptr statistics) { - if (statistics == nullptr) { - return std::nullopt; - } - - auto field_expr = compute::field_ref(field->name()); + const Field& field, const parquet::Statistics& statistics) { + auto field_expr = compute::field_ref(field.name()); // Optimize for corner case where all values are nulls - if (statistics->num_values() == 0 && statistics->null_count() > 0) { + if (statistics.num_values() == 0 && statistics.null_count() > 0) { return is_null(std::move(field_expr)); } @@ -295,8 +296,8 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr return std::nullopt; } - auto maybe_min = min->CastTo(field->type()); - auto maybe_max = max->CastTo(field->type()); + auto maybe_min = min->CastTo(field.type()); + auto maybe_max = max->CastTo(field.type()); if (maybe_min.ok() && maybe_max.ok()) { min = maybe_min.MoveValueUnsafe(); @@ -312,7 +313,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr if (min->Equals(max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); - if (statistics->null_count() == 0) { + if (statistics.null_count() == 0) { return single_value; } return compute::or_(std::move(single_value), is_null(std::move(field_expr))); @@ -332,7 +333,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); } - if (statistics->null_count() != 0) { + if (statistics.null_count() != 0) { return compute::or_(std::move(in_range), compute::is_null(field_expr)); } return in_range; diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index d8d0be234fe..7fb8ca8191a 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -164,7 +164,7 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { Result> Subset(std::vector row_group_ids); static std::optional EvaluateStatisticsAsExpression( - std::shared_ptr field, std::shared_ptr statistics); + const Field& field, const parquet::Statistics& statistics); private: ParquetFileFragment(FileSource source, std::shared_ptr format, diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 7c4b58d6e42..dc9cd46c178 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -644,7 +644,7 @@ TEST(TestParquetStatistics, NullMax) { parquet::ParquetFileReader::OpenFile(dir_string + "/nan_in_stats.parquet"); auto statistics = reader->RowGroup(0)->metadata()->ColumnChunk(0)->statistics(); auto stat_expression = - ParquetFileFragment::EvaluateStatisticsAsExpression(field, statistics); + ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics); EXPECT_EQ(stat_expression->ToString(), "(x >= 1)"); } From f07e4ac2d451baf7d2d8f65e8202b25d4146b127 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 8 Feb 2023 02:16:13 +0530 Subject: [PATCH 10/11] fix: wrong indirection for statistics object --- cpp/src/arrow/dataset/file_parquet.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 709a35a795c..330312dc682 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -292,7 +292,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr } std::shared_ptr min, max; - if (!StatisticsAsScalars(*statistics, &min, &max).ok()) { + if (!StatisticsAsScalars(statistics, &min, &max).ok()) { return std::nullopt; } From 23bc0c0c95dc3fae7ebd444a7135084112e0af54 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 9 Feb 2023 22:19:33 +0530 Subject: [PATCH 11/11] feat: organise code to group together NaN handling --- cpp/src/arrow/dataset/file_parquet.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 330312dc682..cc7d7ee720d 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -303,13 +303,6 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr min = maybe_min.MoveValueUnsafe(); max = maybe_max.MoveValueUnsafe(); - // Since the minimum & maximum values are NaN, useful statistics - // cannot be extracted for checking the presence of a value within - // range - if (IsNan(*min) && IsNan(*max)) { - return std::nullopt; - } - if (min->Equals(max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); @@ -323,6 +316,13 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr auto upper_bound = compute::less_equal(field_expr, compute::literal(max)); compute::Expression in_range; + // Since the minimum & maximum values are NaN, useful statistics + // cannot be extracted for checking the presence of a value within + // range + if (IsNan(*min) && IsNan(*max)) { + return std::nullopt; + } + // If either minimum or maximum is NaN, it should be ignored for the // range computation if (IsNan(*min)) {