diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 3c27bd2b00e..cc7d7ee720d 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -98,6 +98,19 @@ Result> GetSchemaManifest( return manifest; } +bool IsNan(const Scalar& value) { + if (value.is_valid) { + if (value.type->id() == Type::FLOAT) { + const FloatScalar& float_scalar = checked_cast(value); + return std::isnan(float_scalar.value); + } else if (value.type->id() == Type::DOUBLE) { + const DoubleScalar& double_scalar = checked_cast(value); + return std::isnan(double_scalar.value); + } + } + return false; +} + std::optional ColumnChunkStatisticsAsExpression( const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) { // For the remaining of this function, failure to extract/parse statistics @@ -112,50 +125,13 @@ std::optional ColumnChunkStatisticsAsExpression( auto column_metadata = metadata.ColumnChunk(schema_field.column_index); auto statistics = column_metadata->statistics(); - if (statistics == nullptr) { - return std::nullopt; - } - const auto& field = schema_field.field; - auto field_expr = compute::field_ref(field->name()); - // Optimize for corner case where all values are nulls - if (statistics->num_values() == 0 && statistics->null_count() > 0) { - return is_null(std::move(field_expr)); - } - - std::shared_ptr min, max; - if (!StatisticsAsScalars(*statistics, &min, &max).ok()) { + if (statistics == nullptr) { return std::nullopt; } - auto maybe_min = min->CastTo(field->type()); - auto maybe_max = max->CastTo(field->type()); - if (maybe_min.ok() && maybe_max.ok()) { - min = maybe_min.MoveValueUnsafe(); - max = maybe_max.MoveValueUnsafe(); - - if (min->Equals(max)) { - auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); - - if (statistics->null_count() == 0) { - return single_value; - } - return compute::or_(std::move(single_value), is_null(std::move(field_expr))); - } - - auto lower_bound = - compute::greater_equal(field_expr, compute::literal(std::move(min))); - auto upper_bound = compute::less_equal(field_expr, compute::literal(std::move(max))); - - auto in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); - if (statistics->null_count() != 0) { - return compute::or_(std::move(in_range), compute::is_null(field_expr)); - } - return in_range; - } - - return std::nullopt; + return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics); } void AddColumnIndices(const SchemaField& schema_field, @@ -306,6 +282,65 @@ Result IsSupportedParquetFile(const ParquetFileFormat& format, } // namespace +std::optional ParquetFileFragment::EvaluateStatisticsAsExpression( + const Field& field, const parquet::Statistics& statistics) { + auto field_expr = compute::field_ref(field.name()); + + // Optimize for corner case where all values are nulls + if (statistics.num_values() == 0 && statistics.null_count() > 0) { + return is_null(std::move(field_expr)); + } + + std::shared_ptr min, max; + if (!StatisticsAsScalars(statistics, &min, &max).ok()) { + return std::nullopt; + } + + auto maybe_min = min->CastTo(field.type()); + auto maybe_max = max->CastTo(field.type()); + + if (maybe_min.ok() && maybe_max.ok()) { + min = maybe_min.MoveValueUnsafe(); + max = maybe_max.MoveValueUnsafe(); + + if (min->Equals(max)) { + auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); + + if (statistics.null_count() == 0) { + return single_value; + } + return compute::or_(std::move(single_value), is_null(std::move(field_expr))); + } + + auto lower_bound = compute::greater_equal(field_expr, compute::literal(min)); + auto upper_bound = compute::less_equal(field_expr, compute::literal(max)); + compute::Expression in_range; + + // Since the minimum & maximum values are NaN, useful statistics + // cannot be extracted for checking the presence of a value within + // range + if (IsNan(*min) && IsNan(*max)) { + return std::nullopt; + } + + // If either minimum or maximum is NaN, it should be ignored for the + // range computation + if (IsNan(*min)) { + in_range = std::move(upper_bound); + } else if (IsNan(*max)) { + in_range = std::move(lower_bound); + } else { + in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); + } + + if (statistics.null_count() != 0) { + return compute::or_(std::move(in_range), compute::is_null(field_expr)); + } + return in_range; + } + return std::nullopt; +} + ParquetFileFormat::ParquetFileFormat() : FileFormat(std::make_shared()) {} diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index 1087fb9f9de..7fb8ca8191a 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -163,6 +163,9 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { Result> Subset(compute::Expression predicate); Result> Subset(std::vector row_group_ids); + static std::optional EvaluateStatisticsAsExpression( + const Field& field, const parquet::Statistics& statistics); + private: ParquetFileFragment(FileSource source, std::shared_ptr format, compute::Expression partition_expression, diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 1d2febf0fe8..dc9cd46c178 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -32,10 +32,14 @@ #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/util/io_util.h" #include "arrow/util/range.h" #include "parquet/arrow/writer.h" +#include "parquet/file_reader.h" #include "parquet/metadata.h" +#include "parquet/statistics.h" +#include "parquet/types.h" namespace arrow { @@ -632,5 +636,17 @@ INSTANTIATE_TEST_SUITE_P(TestScan, TestParquetFileFormatScan, ::testing::ValuesIn(TestFormatParams::Values()), TestFormatParams::ToTestNameString); +TEST(TestParquetStatistics, NullMax) { + auto field = ::arrow::field("x", float32()); + ASSERT_OK_AND_ASSIGN(std::string dir_string, + arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); + auto reader = + parquet::ParquetFileReader::OpenFile(dir_string + "/nan_in_stats.parquet"); + auto statistics = reader->RowGroup(0)->metadata()->ColumnChunk(0)->statistics(); + auto stat_expression = + ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics); + EXPECT_EQ(stat_expression->ToString(), "(x >= 1)"); +} + } // namespace dataset } // namespace arrow diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index e2d244ab9a8..33b4e23376c 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit e2d244ab9a84d382e3a50f55db41f362e450428b +Subproject commit 33b4e23376c28e489c6a08b9207829b29e4bffb8 diff --git a/testing b/testing index ecab1162cbe..d2c73bf7824 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit ecab1162cbec872e17d949ecc86181670aee045c +Subproject commit d2c73bf78246331d8e58b6f11aa8aa199cbb5929