From a0239aabc26b34a465b91aace151da861fedbcdc Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 11:23:24 -0500 Subject: [PATCH 01/66] Add NanNullOptions for IsNull kernel --- cpp/src/arrow/compute/api_scalar.cc | 5 ++++- cpp/src/arrow/compute/api_scalar.h | 12 +++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 1feb4e7eee0..cf45b696ad8 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -451,7 +451,6 @@ Result Compare(const Datum& left, const Datum& right, CompareOptions opti // Validity functions SCALAR_EAGER_UNARY(IsValid, "is_valid") -SCALAR_EAGER_UNARY(IsNull, "is_null") SCALAR_EAGER_UNARY(IsNan, "is_nan") Result FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx) { @@ -471,6 +470,10 @@ Result CaseWhen(const Datum& cond, const std::vector& cases, return CallFunction("case_when", args, ctx); } +Result IsNull(const Datum& arg, const NanNullOptions& options, ExecContext* ctx) { + return CallFunction("is_null", {arg}, &options, ctx); +} + // ---------------------------------------------------------------------- // Temporal functions diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index e07e41569a1..f8c208f5506 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -209,6 +209,15 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { int64_t start, stop, step; }; +class ARROW_EXPORT NanNullOptions : public FunctionOptions { + public: + explicit NanNullOptions(bool nan_is_null); + NanNullOptions(); + constexpr static char const kTypeName[] = "NanNullOptions"; + + bool nan_is_null; +}; + enum CompareOperator : int8_t { EQUAL, NOT_EQUAL, @@ -727,13 +736,14 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// false otherwise /// /// \param[in] values input to examine for nullity +/// \param[in] options NanNullOptions /// \param[in] ctx the function execution context, optional /// \return the resulting datum /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, ExecContext* ctx = NULLPTR); +Result IsNull(const Datum& values, const NanNullOptions& options, ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, /// false otherwise From bdc3453b77c39d95ce26b56e350b0c338177067b Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 11:31:37 -0500 Subject: [PATCH 02/66] Add kNanNullOptionsType to RegisterScalarOptions --- cpp/src/arrow/compute/api_scalar.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index cf45b696ad8..d0ba0357ad4 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -159,6 +159,8 @@ static auto kMakeStructOptionsType = GetFunctionOptionsType( static auto kDayOfWeekOptionsType = GetFunctionOptionsType( DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering), DataMember("week_start", &DayOfWeekOptions::week_start)); +static auto kNanNullOptionsType = GetFunctionOptionsType( + DataMember("nan_is_null", &NanNullOptions::nan_is_null)); } // namespace } // namespace internal @@ -281,6 +283,12 @@ DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start week_start(week_start) {} constexpr char DayOfWeekOptions::kTypeName[]; +NanNullOptions::NanNullOptions(bool nan_is_null) + : FunctionOptions(internal::kNanNullOptionsType), + nan_is_null(nan_is_null) {} +NanNullOptions::NanNullOptions() : NanNullOptions(false) {} +constexpr char NanNullOptions::kTypeName[]; + namespace internal { void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType)); @@ -299,6 +307,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kNanNullOptionsType)); } } // namespace internal From 518bd3bb959cc78fdda18c613dfcc6508f8dda44 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 12:18:58 -0500 Subject: [PATCH 03/66] Add Init param for MafeFunction and docs for is_null kernel --- .../arrow/compute/kernels/scalar_validity.cc | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index ead88abc0f2..fd60d012a70 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" #include "arrow/util/bit_util.h" @@ -76,6 +77,15 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + bool nan_is_null = false; + if (in.is_valid && in.ToString() == "nan") { + if (in.type == float16() || in.type == float32() || in.type == float64()) { + nan_is_null = true; + checked_cast(out)->value = nan_is_null; + return Status::OK(); + } + } + checked_cast(out)->value = !in.is_valid; return Status::OK(); } @@ -104,11 +114,11 @@ struct IsNanOperator { void MakeFunction(std::string name, const FunctionDoc* doc, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, - MemAllocation::type mem_allocation, bool can_write_into_slices) { + MemAllocation::type mem_allocation, bool can_write_into_slices, const FunctionOptions* default_options = NULLPTR, KernelInit init = NULLPTR) { Arity arity{static_cast(in_types.size())}; - auto func = std::make_shared(name, arity, doc); + auto func = std::make_shared(name, arity, doc, default_options); - ScalarKernel kernel(std::move(in_types), out_type, exec); + ScalarKernel kernel(std::move(in_types), out_type, exec, init); kernel.null_handling = NullHandling::OUTPUT_NOT_NULL; kernel.can_write_into_slices = can_write_into_slices; kernel.mem_allocation = mem_allocation; @@ -202,9 +212,9 @@ const FunctionDoc is_inf_doc( ("For each input value, emit true iff the value is infinite (inf or -inf)."), {"values"}); -const FunctionDoc is_null_doc("Return true if null", - ("For each input value, emit true iff the value is null."), - {"values"}); +const FunctionDoc is_null_doc("Return true if null, NaN can be considered as null", + ("For each input value, emit true iff the value is null. Default behavior is to emit false for NaN values. True can be emitted for NaN values by toggling NanNullOptions flag."), + {"values"}, "NanNullOptions"); const FunctionDoc is_nan_doc("Return true if NaN", ("For each input value, emit true iff the value is NaN."), @@ -212,13 +222,15 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec, registry, MemAllocation::PREALLOCATE, - /*can_write_into_slices=*/true); + /*can_write_into_slices=*/true, &kNanNullOptions, OptionsWrapper::Init); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc))); DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc))); From c98c665e0b3fb13bff32f646fcfb52da24c3914a Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 12:24:50 -0500 Subject: [PATCH 04/66] Add defaults() method to NanNullOptions class --- cpp/src/arrow/compute/api_scalar.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index f8c208f5506..5067699c531 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -213,6 +213,7 @@ class ARROW_EXPORT NanNullOptions : public FunctionOptions { public: explicit NanNullOptions(bool nan_is_null); NanNullOptions(); + static NanNullOptions Defaults() { return NanNullOptions{}; } constexpr static char const kTypeName[] = "NanNullOptions"; bool nan_is_null; From c993efccda1f017a2a4bd1358189c30ef06500d0 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 13:33:24 -0500 Subject: [PATCH 05/66] Add implementation for IsNullOperator, Scalar case --- cpp/src/arrow/compute/api_scalar.cc | 1 - cpp/src/arrow/compute/api_scalar.h | 5 ++--- cpp/src/arrow/compute/kernels/scalar_validity.cc | 12 +++++------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index d0ba0357ad4..97572a53435 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -286,7 +286,6 @@ constexpr char DayOfWeekOptions::kTypeName[]; NanNullOptions::NanNullOptions(bool nan_is_null) : FunctionOptions(internal::kNanNullOptionsType), nan_is_null(nan_is_null) {} -NanNullOptions::NanNullOptions() : NanNullOptions(false) {} constexpr char NanNullOptions::kTypeName[]; namespace internal { diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 5067699c531..720ba67f0e9 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -211,8 +211,7 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { class ARROW_EXPORT NanNullOptions : public FunctionOptions { public: - explicit NanNullOptions(bool nan_is_null); - NanNullOptions(); + explicit NanNullOptions(bool nan_is_null = false); static NanNullOptions Defaults() { return NanNullOptions{}; } constexpr static char const kTypeName[] = "NanNullOptions"; @@ -744,7 +743,7 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, const NanNullOptions& options, ExecContext* ctx = NULLPTR); +Result IsNull(const Datum& values, NanNullOptions options = NanNullOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, /// false otherwise diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index fd60d012a70..b73cafc1dbb 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -77,13 +77,11 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - bool nan_is_null = false; - if (in.is_valid && in.ToString() == "nan") { - if (in.type == float16() || in.type == float32() || in.type == float64()) { - nan_is_null = true; - checked_cast(out)->value = nan_is_null; - return Status::OK(); - } + auto options = OptionsWrapper::Get(ctx); + bool is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); + if (in.is_valid && is_floating(in.type->id()) && options.nan_is_null && is_nan_value) { + checked_cast(out)->value = true; + return Status::OK(); } checked_cast(out)->value = !in.is_valid; From da9fecc14ac481556fb79133758f338d5fa09c52 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 18:35:42 -0500 Subject: [PATCH 06/66] Fix test compilation issue --- cpp/src/arrow/compute/api_scalar.cc | 2 +- cpp/src/arrow/compute/api_scalar.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 97572a53435..0c1113d0c22 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -478,7 +478,7 @@ Result CaseWhen(const Datum& cond, const std::vector& cases, return CallFunction("case_when", args, ctx); } -Result IsNull(const Datum& arg, const NanNullOptions& options, ExecContext* ctx) { +Result IsNull(const Datum& arg, NanNullOptions options, ExecContext* ctx) { return CallFunction("is_null", {arg}, &options, ctx); } diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 720ba67f0e9..35fcfa1716e 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -212,8 +212,8 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { class ARROW_EXPORT NanNullOptions : public FunctionOptions { public: explicit NanNullOptions(bool nan_is_null = false); - static NanNullOptions Defaults() { return NanNullOptions{}; } constexpr static char const kTypeName[] = "NanNullOptions"; + static NanNullOptions Defaults() { return NanNullOptions{}; } bool nan_is_null; }; From 7b6ca7821a536d36f2b442b3c646c1fa1307091a Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 23:15:06 -0500 Subject: [PATCH 07/66] Apply clang format, add tests for isnull --- cpp/src/arrow/compute/api_scalar.cc | 3 +- cpp/src/arrow/compute/api_scalar.h | 4 ++- .../arrow/compute/kernels/scalar_validity.cc | 29 ++++++++++++++----- .../compute/kernels/scalar_validity_test.cc | 14 +++++++++ 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 0c1113d0c22..4bc6cf33ba2 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -284,8 +284,7 @@ DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start constexpr char DayOfWeekOptions::kTypeName[]; NanNullOptions::NanNullOptions(bool nan_is_null) - : FunctionOptions(internal::kNanNullOptionsType), - nan_is_null(nan_is_null) {} + : FunctionOptions(internal::kNanNullOptionsType), nan_is_null(nan_is_null) {} constexpr char NanNullOptions::kTypeName[]; namespace internal { diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 35fcfa1716e..57ed6d8b7d0 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -743,7 +743,9 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, NanNullOptions options = NanNullOptions::Defaults(), ExecContext* ctx = NULLPTR); +Result IsNull(const Datum& values, + NanNullOptions options = NanNullOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, /// false otherwise diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index b73cafc1dbb..fa61074468a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -77,9 +77,18 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + bool is_nan_value = false; + bool is_floating = false; + if (in.type == float32()) { + is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); + is_floating = true; + } else if (in.type == float64()) { + is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); + is_floating = true; + } + auto options = OptionsWrapper::Get(ctx); - bool is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); - if (in.is_valid && is_floating(in.type->id()) && options.nan_is_null && is_nan_value) { + if (in.is_valid && is_floating && options.nan_is_null && is_nan_value) { checked_cast(out)->value = true; return Status::OK(); } @@ -112,7 +121,9 @@ struct IsNanOperator { void MakeFunction(std::string name, const FunctionDoc* doc, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, - MemAllocation::type mem_allocation, bool can_write_into_slices, const FunctionOptions* default_options = NULLPTR, KernelInit init = NULLPTR) { + MemAllocation::type mem_allocation, bool can_write_into_slices, + const FunctionOptions* default_options = NULLPTR, + KernelInit init = NULLPTR) { Arity arity{static_cast(in_types.size())}; auto func = std::make_shared(name, arity, doc, default_options); @@ -210,9 +221,12 @@ const FunctionDoc is_inf_doc( ("For each input value, emit true iff the value is infinite (inf or -inf)."), {"values"}); -const FunctionDoc is_null_doc("Return true if null, NaN can be considered as null", - ("For each input value, emit true iff the value is null. Default behavior is to emit false for NaN values. True can be emitted for NaN values by toggling NanNullOptions flag."), - {"values"}, "NanNullOptions"); +const FunctionDoc is_null_doc( + "Return true if null, NaN can be considered as null", + ("For each input value, emit true iff the value is null. Default behavior is to emit " + "false for NaN values. True can be emitted for NaN values by toggling " + "NanNullOptions flag."), + {"values"}, "NanNullOptions"); const FunctionDoc is_nan_doc("Return true if NaN", ("For each input value, emit true iff the value is NaN."), @@ -228,7 +242,8 @@ void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec, registry, MemAllocation::PREALLOCATE, - /*can_write_into_slices=*/true, &kNanNullOptions, OptionsWrapper::Init); + /*can_write_into_slices=*/true, &kNanNullOptions, + OptionsWrapper::Init); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc))); DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc))); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 1a7a1cbda15..e4b5698383b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -75,6 +75,20 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), "[true, false, false, true]"); + + // TODO: This tests could be helpful (scalar for now works well, but not ArrayData still + // not) By default 'NaN' value is not considered as null + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[true, false, false]")); + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[true, false, false]")); + + // Setting 'nan_is_null' as true, 'NaN value will be considered as null + const NanNullOptions& options = NanNullOptions(true); + // CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), + // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only + // CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), + // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only } TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { From e4adc7d571e7ff6910ded7534f3b67c1832ccbf9 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 23:20:51 -0500 Subject: [PATCH 08/66] Improve message for is_null tests --- cpp/src/arrow/compute/kernels/scalar_validity_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index e4b5698383b..1988cd5a200 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -76,8 +76,7 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), "[true, false, false, true]"); - // TODO: This tests could be helpful (scalar for now works well, but not ArrayData still - // not) By default 'NaN' value is not considered as null + // By default 'NaN' value is not considered as null CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), ArrayFromJSON(boolean(), "[true, false, false]")); CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), @@ -85,6 +84,7 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { // Setting 'nan_is_null' as true, 'NaN value will be considered as null const NanNullOptions& options = NanNullOptions(true); + // TODO: These tests could be helpful (Scalar for now works, but not ArrayData) // CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only // CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), From 8bdfe99ee412b442c3771c946aa6ac162036a4b2 Mon Sep 17 00:00:00 2001 From: christian Date: Mon, 9 Aug 2021 16:26:55 -0500 Subject: [PATCH 09/66] Apply requested changes for IsNullOperator --- .../arrow/compute/kernels/scalar_validity.cc | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index fa61074468a..2cac6abeb8c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,6 +32,8 @@ namespace compute { namespace internal { namespace { +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -77,23 +79,26 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - bool is_nan_value = false; - bool is_floating = false; - if (in.type == float32()) { - is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); - is_floating = true; - } else if (in.type == float64()) { - is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); - is_floating = true; - } - auto options = OptionsWrapper::Get(ctx); - if (in.is_valid && is_floating && options.nan_is_null && is_nan_value) { - checked_cast(out)->value = true; - return Status::OK(); + bool* out_value = &checked_cast(out)->value; + if (in.is_valid) { + switch (in.type->id()) { + case Type::FLOAT: + *out_value = options.nan_is_null && + std::isnan(internal::UnboxScalar::Unbox(in)); + break; + case Type::DOUBLE: + *out_value = options.nan_is_null && + std::isnan(internal::UnboxScalar::Unbox(in)); + break; + default: + *out_value = false; + break; + } + } else { + *out_value = true; } - checked_cast(out)->value = !in.is_valid; return Status::OK(); } @@ -234,8 +239,6 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); From 685c0386570a0a16854f2c87ead35590f61a3730 Mon Sep 17 00:00:00 2001 From: christian Date: Mon, 9 Aug 2021 16:48:39 -0500 Subject: [PATCH 10/66] Remove default break, add todo to handle ArrayData for is_null --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 2cac6abeb8c..c11c5ef8e56 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -93,7 +93,6 @@ struct IsNullOperator { break; default: *out_value = false; - break; } } else { *out_value = true; @@ -103,6 +102,9 @@ struct IsNullOperator { } static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { + // TODO: Is `options` needed for detect nulls? Which is the better way to + // handle is_null for ArrayData + auto options = OptionsWrapper::Get(ctx); if (arr.MayHaveNulls()) { // Input has nulls => output is the inverted null (validity) bitmap. InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, From 05b458bba8df47e64b87f6741a5d88bd3963cf5a Mon Sep 17 00:00:00 2001 From: christian Date: Tue, 10 Aug 2021 15:50:01 -0500 Subject: [PATCH 11/66] Apply SetBitsTo for NaN values when passed NanNullOptions --- .../arrow/compute/kernels/scalar_validity.cc | 26 +++++++++++++++---- .../compute/kernels/scalar_validity_test.cc | 12 ++++----- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index c11c5ef8e56..9c3d479fff5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,8 +32,6 @@ namespace compute { namespace internal { namespace { -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -102,9 +100,6 @@ struct IsNullOperator { } static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { - // TODO: Is `options` needed for detect nulls? Which is the better way to - // handle is_null for ArrayData - auto options = OptionsWrapper::Get(ctx); if (arr.MayHaveNulls()) { // Input has nulls => output is the inverted null (validity) bitmap. InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, @@ -114,6 +109,25 @@ struct IsNullOperator { BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, false); } + + auto options = OptionsWrapper::Get(ctx); + if (options.nan_is_null) { + if (arr.type->id() == Type::FLOAT) { + const float* data = arr.GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + if (std::isnan(data[i])) + BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), i, 1, true); + } + + } else if (arr.type->id() == Type::DOUBLE) { + const double* data = arr.GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + if (std::isnan(data[i])) + BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), i, 1, true); + } + } + } + return Status::OK(); } }; @@ -241,6 +255,8 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 1988cd5a200..c248f241719 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -82,13 +82,13 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), ArrayFromJSON(boolean(), "[true, false, false]")); - // Setting 'nan_is_null' as true, 'NaN value will be considered as null + // Setting 'nan_is_null' as true, 'NaN value could be considered as null (floating + // points) const NanNullOptions& options = NanNullOptions(true); - // TODO: These tests could be helpful (Scalar for now works, but not ArrayData) - // CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), - // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only - // CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), - // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[5.0, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[false, false, true]"), &options); + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[true, false, true]"), &options); } TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { From c47838e1c23e4a61fa842fec519e5d1c1d096296 Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 09:40:52 -0500 Subject: [PATCH 12/66] move kNanNullOptions to anonymous namespace --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 9c3d479fff5..f98d4ce85e8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,6 +32,8 @@ namespace compute { namespace internal { namespace { +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -255,8 +257,6 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); From 5f6e8dd2104d62ee94eeef49b1809ba282525da2 Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 12:14:49 -0500 Subject: [PATCH 13/66] Fix for arrow-compute-expression-test (is_null) --- cpp/src/arrow/compute/exec/expression.cc | 5 ++++- cpp/src/arrow/compute/exec/expression.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc index 67a9f3c40ff..34d21d6c87f 100644 --- a/cpp/src/arrow/compute/exec/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -1154,7 +1154,10 @@ Expression greater_equal(Expression lhs, Expression rhs) { return call("greater_equal", {std::move(lhs), std::move(rhs)}); } -Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); } +Expression is_null(Expression lhs, bool nan_is_null) { + return call("is_null", {std::move(lhs)}, + compute::NanNullOptions(std::move(nan_is_null))); +} Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); } diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h index 3810accf70a..dac5728ab46 100644 --- a/cpp/src/arrow/compute/exec/expression.h +++ b/cpp/src/arrow/compute/exec/expression.h @@ -255,7 +255,7 @@ ARROW_EXPORT Expression greater(Expression lhs, Expression rhs); ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs); -ARROW_EXPORT Expression is_null(Expression lhs); +ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false); ARROW_EXPORT Expression is_valid(Expression lhs); From d91d64f97382b455480761ba6928ddaa2953ee4a Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 15:34:33 -0500 Subject: [PATCH 14/66] Add bindings in cython --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 4 ++-- python/pyarrow/_compute.pyx | 6 ++++++ python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 5 +++++ python/pyarrow/tests/test_compute.py | 1 + 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index f98d4ce85e8..9c3d479fff5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,8 +32,6 @@ namespace compute { namespace internal { namespace { -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -257,6 +255,8 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 2d82928f377..516e0cddf44 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -796,12 +796,18 @@ cdef class _SliceOptions(FunctionOptions): def _set_options(self, int64_t start, int64_t stop, int64_t step): self.wrapped.reset(new CSliceOptions(start, stop, step)) +cdef class _NanNullOptions(FunctionOptions): + def _set_options(self, bool nan_is_null): + self.wrapped.reset(new CNanNullOptions(nan_is_null)) class SliceOptions(_SliceOptions): def __init__(self, int64_t start, int64_t stop=sys.maxsize, int64_t step=1): self._set_options(start, stop, step) +class NanNullOptions(_NanNullOptions): + def __init__(self, bool nan_is_null): + self._set_options(nan_is_null) cdef class _FilterOptions(FunctionOptions): def _set_options(self, null_selection_behavior): diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 10880c2974c..63ff6d92a3b 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -49,6 +49,7 @@ ScalarAggregateOptions, SetLookupOptions, SliceOptions, + NanNullOptions, SortOptions, SplitOptions, SplitPatternOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 7dcde652a95..2eed779ec89 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1854,6 +1854,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: int64_t start int64_t stop int64_t step + + cdef cppclass CNanNullOptions \ + "arrow::compute::NanNullOptions"(CFunctionOptions): + CNanNullOptions(bool nan_is_null) + bool nan_is_null cdef cppclass CSplitOptions \ "arrow::compute::SplitOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 60a2f60f942..f3904da4fc0 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -125,6 +125,7 @@ def test_option_class_equality(): pc.ReplaceSubstringOptions("a", "b"), pc.SetLookupOptions(value_set=pa.array([1])), pc.SliceOptions(start=0, stop=1, step=1), + pc.NanNullOptions(nan_is_null=False), pc.SplitPatternOptions(pattern="pattern"), pc.StrptimeOptions("%Y", "s"), pc.TrimOptions(" "), From 41515f1c217e5701cec645c563c13e19589acf2a Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 19:48:59 -0500 Subject: [PATCH 15/66] Add specialized tests for is_null --- cpp/src/arrow/compute/api_scalar.h | 2 +- .../arrow/compute/kernels/scalar_validity.cc | 8 +- .../compute/kernels/scalar_validity_test.cc | 101 +++++++++++++++--- 3 files changed, 90 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 57ed6d8b7d0..9dc084f5066 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -215,7 +215,7 @@ class ARROW_EXPORT NanNullOptions : public FunctionOptions { constexpr static char const kTypeName[] = "NanNullOptions"; static NanNullOptions Defaults() { return NanNullOptions{}; } - bool nan_is_null; + bool nan_is_null = false; }; enum CompareOperator : int8_t { diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 9c3d479fff5..09a468fcd3b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,6 +32,8 @@ namespace compute { namespace internal { namespace { +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -77,7 +79,7 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - auto options = OptionsWrapper::Get(ctx); + const NanNullOptions& options = OptionsWrapper::Get(ctx); bool* out_value = &checked_cast(out)->value; if (in.is_valid) { switch (in.type->id()) { @@ -110,7 +112,7 @@ struct IsNullOperator { false); } - auto options = OptionsWrapper::Get(ctx); + const NanNullOptions& options = OptionsWrapper::Get(ctx); if (options.nan_is_null) { if (arr.type->id() == Type::FLOAT) { const float* data = arr.GetValues(1); @@ -255,8 +257,6 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index c248f241719..c23ce987b79 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -75,20 +75,94 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), "[true, false, false, true]"); +} - // By default 'NaN' value is not considered as null - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[true, false, false]")); - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[true, false, false]")); +TEST_F(TestFloatValidityKernels, FloatArrayIsNull) { + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false]")); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); +} - // Setting 'nan_is_null' as true, 'NaN value could be considered as null (floating - // points) +TEST_F(TestFloatValidityKernels, FloatArrayIsNullUsingNanNullOptions) { const NanNullOptions& options = NanNullOptions(true); - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[5.0, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[false, false, true]"), &options); - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[true, false, true]"), &options); + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &options); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"), + &options); +} + +TEST_F(TestDoubleValidityKernels, DoubleArrayIsNull) { + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false]")); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); +} + +TEST_F(TestDoubleValidityKernels, DoubleArrayIsNullUsingNanNullOptions) { + const NanNullOptions& options = NanNullOptions(true); + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &options); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"), + &options); +} + +TEST_F(TestFloatValidityKernels, FloatScalarIsNull) { + CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false)); + CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true)); +} + +TEST_F(TestFloatValidityKernels, FloatScalarIsNullUsingNanNullOptions) { + const NanNullOptions& options = NanNullOptions(true); + CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(true), &options); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true), &options); +} + +TEST_F(TestDoubleValidityKernels, DoubleScalarIsNull) { + CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false)); + CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); +} + +TEST_F(TestDoubleValidityKernels, DoubleScalarIsNullUsingNanNullOptions) { + const NanNullOptions& options = NanNullOptions(true); + CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(true), &options); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true), &options); } TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { @@ -97,11 +171,6 @@ TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { ASSERT_EQ(result->null_count, 0); } -TEST_F(TestBooleanValidityKernels, ScalarIsNull) { - CheckScalarUnary("is_null", MakeScalar(19.7), MakeScalar(false)); - CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); -} - TEST_F(TestFloatValidityKernels, FloatArrayIsFinite) { // All Inf CheckScalarUnary("is_finite", ArrayFromJSON(float32(), "[Inf, -Inf, Inf, -Inf, Inf]"), From 918539faaa82c9d1d5d2cd3fdbfcaa6c94b49dc1 Mon Sep 17 00:00:00 2001 From: christian Date: Thu, 12 Aug 2021 00:15:10 -0500 Subject: [PATCH 16/66] Fix Sanitizer for KNanNullOptions --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 09a468fcd3b..59f5cdb00a5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,8 +32,6 @@ namespace compute { namespace internal { namespace { -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -258,6 +256,7 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace void RegisterScalarValidity(FunctionRegistry* registry) { + static auto kNanNullOptions = NanNullOptions::Defaults(); MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); From 527ad789bda7ba950a774e0b0211ff61ae758cc3 Mon Sep 17 00:00:00 2001 From: christian Date: Thu, 12 Aug 2021 00:25:02 -0500 Subject: [PATCH 17/66] Remove cython bindigs --- python/pyarrow/_compute.pyx | 8 -------- python/pyarrow/compute.py | 1 - python/pyarrow/includes/libarrow.pxd | 5 ----- python/pyarrow/tests/test_compute.py | 1 - 4 files changed, 15 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 516e0cddf44..aa6e5cfde94 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -796,19 +796,11 @@ cdef class _SliceOptions(FunctionOptions): def _set_options(self, int64_t start, int64_t stop, int64_t step): self.wrapped.reset(new CSliceOptions(start, stop, step)) -cdef class _NanNullOptions(FunctionOptions): - def _set_options(self, bool nan_is_null): - self.wrapped.reset(new CNanNullOptions(nan_is_null)) - class SliceOptions(_SliceOptions): def __init__(self, int64_t start, int64_t stop=sys.maxsize, int64_t step=1): self._set_options(start, stop, step) -class NanNullOptions(_NanNullOptions): - def __init__(self, bool nan_is_null): - self._set_options(nan_is_null) - cdef class _FilterOptions(FunctionOptions): def _set_options(self, null_selection_behavior): if null_selection_behavior == 'drop': diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 63ff6d92a3b..10880c2974c 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -49,7 +49,6 @@ ScalarAggregateOptions, SetLookupOptions, SliceOptions, - NanNullOptions, SortOptions, SplitOptions, SplitPatternOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 2eed779ec89..7dcde652a95 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1854,11 +1854,6 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: int64_t start int64_t stop int64_t step - - cdef cppclass CNanNullOptions \ - "arrow::compute::NanNullOptions"(CFunctionOptions): - CNanNullOptions(bool nan_is_null) - bool nan_is_null cdef cppclass CSplitOptions \ "arrow::compute::SplitOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index f3904da4fc0..60a2f60f942 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -125,7 +125,6 @@ def test_option_class_equality(): pc.ReplaceSubstringOptions("a", "b"), pc.SetLookupOptions(value_set=pa.array([1])), pc.SliceOptions(start=0, stop=1, step=1), - pc.NanNullOptions(nan_is_null=False), pc.SplitPatternOptions(pattern="pattern"), pc.StrptimeOptions("%Y", "s"), pc.TrimOptions(" "), From 7847bc3ecc01200104d9e478d2522dad885bebb9 Mon Sep 17 00:00:00 2001 From: christian Date: Thu, 12 Aug 2021 17:47:03 -0500 Subject: [PATCH 18/66] binding in cython layer, fix python tests (theorically) --- python/pyarrow/_compute.pyx | 14 ++++++++++++++ python/pyarrow/_dataset.pyx | 8 ++++++-- python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 5 +++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index aa6e5cfde94..aeadaaac6a6 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -796,11 +796,13 @@ cdef class _SliceOptions(FunctionOptions): def _set_options(self, int64_t start, int64_t stop, int64_t step): self.wrapped.reset(new CSliceOptions(start, stop, step)) + class SliceOptions(_SliceOptions): def __init__(self, int64_t start, int64_t stop=sys.maxsize, int64_t step=1): self._set_options(start, stop, step) + cdef class _FilterOptions(FunctionOptions): def _set_options(self, null_selection_behavior): if null_selection_behavior == 'drop': @@ -991,6 +993,18 @@ class DayOfWeekOptions(_DayOfWeekOptions): self._set_options(one_based_numbering, week_start) +cdef class _NanNullOptions(FunctionOptions): + def _set_options(self, nan_is_null): + self.wrapped.reset( + new CNanNullOptions(nan_is_null) + ) + + +class NanNullOptions(_NanNullOptions): + def __init__(self, nan_is_null=False): + self._set_options(nan_is_null) + + cdef class _VarianceOptions(FunctionOptions): def _set_options(self, ddof): self.wrapped.reset(new CVarianceOptions(ddof)) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 945475bd7f1..f2aecc7d60d 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -234,9 +234,13 @@ cdef class Expression(_Weakrefable): """Checks whether the expression is not-null (valid)""" return Expression._call("is_valid", [self]) - def is_null(self): + def is_null(self, bint nan_is_null=False): """Checks whether the expression is null""" - return Expression._call("is_null", [self]) + cdef: + shared_ptr[CFunctionOptions] c_options + + c_options.reset(new CNanNullOptions(nan_is_null)) + return Expression._call("is_null", [self], c_options) def cast(self, type, bint safe=True): """Explicitly change the expression's data type""" diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 10880c2974c..4efa9fbfc41 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -54,6 +54,7 @@ SplitPatternOptions, StrptimeOptions, DayOfWeekOptions, + NanNullOptions, TakeOptions, TDigestOptions, TrimOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 7dcde652a95..e38a5d6f9f4 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1948,6 +1948,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: c_bool one_based_numbering uint32_t week_start + cdef cppclass CNanNullOptions \ + "arrow::compute::NanNullOptions"(CFunctionOptions): + CNanNullOptions(c_bool nan_is_null) + c_bool nan_is_null + cdef cppclass CVarianceOptions \ "arrow::compute::VarianceOptions"(CFunctionOptions): CVarianceOptions(int ddof) From 4cf4e4ba2d2ad834c40c7fe593c2db311cae8c64 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 11:23:24 -0500 Subject: [PATCH 19/66] Add NanNullOptions for IsNull kernel --- cpp/src/arrow/compute/api_scalar.cc | 5 ++++- cpp/src/arrow/compute/api_scalar.h | 12 +++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index f9c95a7745b..7439cefe003 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -452,7 +452,6 @@ Result Compare(const Datum& left, const Datum& right, CompareOptions opti // Validity functions SCALAR_EAGER_UNARY(IsValid, "is_valid") -SCALAR_EAGER_UNARY(IsNull, "is_null") SCALAR_EAGER_UNARY(IsNan, "is_nan") Result FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx) { @@ -472,6 +471,10 @@ Result CaseWhen(const Datum& cond, const std::vector& cases, return CallFunction("case_when", args, ctx); } +Result IsNull(const Datum& arg, const NanNullOptions& options, ExecContext* ctx) { + return CallFunction("is_null", {arg}, &options, ctx); +} + // ---------------------------------------------------------------------- // Temporal functions diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 115cad13d6f..fe3648f473e 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -209,6 +209,15 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { int64_t start, stop, step; }; +class ARROW_EXPORT NanNullOptions : public FunctionOptions { + public: + explicit NanNullOptions(bool nan_is_null); + NanNullOptions(); + constexpr static char const kTypeName[] = "NanNullOptions"; + + bool nan_is_null; +}; + enum CompareOperator : int8_t { EQUAL, NOT_EQUAL, @@ -741,13 +750,14 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// false otherwise /// /// \param[in] values input to examine for nullity +/// \param[in] options NanNullOptions /// \param[in] ctx the function execution context, optional /// \return the resulting datum /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, ExecContext* ctx = NULLPTR); +Result IsNull(const Datum& values, const NanNullOptions& options, ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, /// false otherwise From 8b0c427d341362870723367ab9de6b0ee2685b4e Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 11:31:37 -0500 Subject: [PATCH 20/66] Add kNanNullOptionsType to RegisterScalarOptions --- cpp/src/arrow/compute/api_scalar.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 7439cefe003..25818cd27eb 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -159,6 +159,8 @@ static auto kMakeStructOptionsType = GetFunctionOptionsType( static auto kDayOfWeekOptionsType = GetFunctionOptionsType( DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering), DataMember("week_start", &DayOfWeekOptions::week_start)); +static auto kNanNullOptionsType = GetFunctionOptionsType( + DataMember("nan_is_null", &NanNullOptions::nan_is_null)); } // namespace } // namespace internal @@ -281,6 +283,12 @@ DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start week_start(week_start) {} constexpr char DayOfWeekOptions::kTypeName[]; +NanNullOptions::NanNullOptions(bool nan_is_null) + : FunctionOptions(internal::kNanNullOptionsType), + nan_is_null(nan_is_null) {} +NanNullOptions::NanNullOptions() : NanNullOptions(false) {} +constexpr char NanNullOptions::kTypeName[]; + namespace internal { void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType)); @@ -299,6 +307,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kNanNullOptionsType)); } } // namespace internal From 59625b3abfb556df23af1b2457142b544c05b221 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 12:18:58 -0500 Subject: [PATCH 21/66] Add Init param for MafeFunction and docs for is_null kernel --- .../arrow/compute/kernels/scalar_validity.cc | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index ead88abc0f2..fd60d012a70 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" #include "arrow/util/bit_util.h" @@ -76,6 +77,15 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + bool nan_is_null = false; + if (in.is_valid && in.ToString() == "nan") { + if (in.type == float16() || in.type == float32() || in.type == float64()) { + nan_is_null = true; + checked_cast(out)->value = nan_is_null; + return Status::OK(); + } + } + checked_cast(out)->value = !in.is_valid; return Status::OK(); } @@ -104,11 +114,11 @@ struct IsNanOperator { void MakeFunction(std::string name, const FunctionDoc* doc, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, - MemAllocation::type mem_allocation, bool can_write_into_slices) { + MemAllocation::type mem_allocation, bool can_write_into_slices, const FunctionOptions* default_options = NULLPTR, KernelInit init = NULLPTR) { Arity arity{static_cast(in_types.size())}; - auto func = std::make_shared(name, arity, doc); + auto func = std::make_shared(name, arity, doc, default_options); - ScalarKernel kernel(std::move(in_types), out_type, exec); + ScalarKernel kernel(std::move(in_types), out_type, exec, init); kernel.null_handling = NullHandling::OUTPUT_NOT_NULL; kernel.can_write_into_slices = can_write_into_slices; kernel.mem_allocation = mem_allocation; @@ -202,9 +212,9 @@ const FunctionDoc is_inf_doc( ("For each input value, emit true iff the value is infinite (inf or -inf)."), {"values"}); -const FunctionDoc is_null_doc("Return true if null", - ("For each input value, emit true iff the value is null."), - {"values"}); +const FunctionDoc is_null_doc("Return true if null, NaN can be considered as null", + ("For each input value, emit true iff the value is null. Default behavior is to emit false for NaN values. True can be emitted for NaN values by toggling NanNullOptions flag."), + {"values"}, "NanNullOptions"); const FunctionDoc is_nan_doc("Return true if NaN", ("For each input value, emit true iff the value is NaN."), @@ -212,13 +222,15 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec, registry, MemAllocation::PREALLOCATE, - /*can_write_into_slices=*/true); + /*can_write_into_slices=*/true, &kNanNullOptions, OptionsWrapper::Init); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc))); DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc))); From 3f91336a63e2ab8f17201682300ee6b26bbde5f8 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 12:24:50 -0500 Subject: [PATCH 22/66] Add defaults() method to NanNullOptions class --- cpp/src/arrow/compute/api_scalar.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index fe3648f473e..fdec9d655c5 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -213,6 +213,7 @@ class ARROW_EXPORT NanNullOptions : public FunctionOptions { public: explicit NanNullOptions(bool nan_is_null); NanNullOptions(); + static NanNullOptions Defaults() { return NanNullOptions{}; } constexpr static char const kTypeName[] = "NanNullOptions"; bool nan_is_null; From 438a4efa79717a452fda8ede16d2f40e7aa2a25e Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 13:33:24 -0500 Subject: [PATCH 23/66] Add implementation for IsNullOperator, Scalar case --- cpp/src/arrow/compute/api_scalar.cc | 1 - cpp/src/arrow/compute/api_scalar.h | 5 ++--- cpp/src/arrow/compute/kernels/scalar_validity.cc | 12 +++++------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 25818cd27eb..1cff0b702a9 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -286,7 +286,6 @@ constexpr char DayOfWeekOptions::kTypeName[]; NanNullOptions::NanNullOptions(bool nan_is_null) : FunctionOptions(internal::kNanNullOptionsType), nan_is_null(nan_is_null) {} -NanNullOptions::NanNullOptions() : NanNullOptions(false) {} constexpr char NanNullOptions::kTypeName[]; namespace internal { diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index fdec9d655c5..d9ac5838f26 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -211,8 +211,7 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { class ARROW_EXPORT NanNullOptions : public FunctionOptions { public: - explicit NanNullOptions(bool nan_is_null); - NanNullOptions(); + explicit NanNullOptions(bool nan_is_null = false); static NanNullOptions Defaults() { return NanNullOptions{}; } constexpr static char const kTypeName[] = "NanNullOptions"; @@ -758,7 +757,7 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, const NanNullOptions& options, ExecContext* ctx = NULLPTR); +Result IsNull(const Datum& values, NanNullOptions options = NanNullOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, /// false otherwise diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index fd60d012a70..b73cafc1dbb 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -77,13 +77,11 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - bool nan_is_null = false; - if (in.is_valid && in.ToString() == "nan") { - if (in.type == float16() || in.type == float32() || in.type == float64()) { - nan_is_null = true; - checked_cast(out)->value = nan_is_null; - return Status::OK(); - } + auto options = OptionsWrapper::Get(ctx); + bool is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); + if (in.is_valid && is_floating(in.type->id()) && options.nan_is_null && is_nan_value) { + checked_cast(out)->value = true; + return Status::OK(); } checked_cast(out)->value = !in.is_valid; From b843c0a196c23efb8439bc15ac2ca864c9e5869d Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 18:35:42 -0500 Subject: [PATCH 24/66] Fix test compilation issue --- cpp/src/arrow/compute/api_scalar.cc | 2 +- cpp/src/arrow/compute/api_scalar.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 1cff0b702a9..335a255d785 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -479,7 +479,7 @@ Result CaseWhen(const Datum& cond, const std::vector& cases, return CallFunction("case_when", args, ctx); } -Result IsNull(const Datum& arg, const NanNullOptions& options, ExecContext* ctx) { +Result IsNull(const Datum& arg, NanNullOptions options, ExecContext* ctx) { return CallFunction("is_null", {arg}, &options, ctx); } diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index d9ac5838f26..8dc4e995166 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -212,8 +212,8 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { class ARROW_EXPORT NanNullOptions : public FunctionOptions { public: explicit NanNullOptions(bool nan_is_null = false); - static NanNullOptions Defaults() { return NanNullOptions{}; } constexpr static char const kTypeName[] = "NanNullOptions"; + static NanNullOptions Defaults() { return NanNullOptions{}; } bool nan_is_null; }; From 2c57c77d8a138a52adfcfb9cc620fba87b969c29 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 23:15:06 -0500 Subject: [PATCH 25/66] Apply clang format, add tests for isnull --- cpp/src/arrow/compute/api_scalar.cc | 3 +- cpp/src/arrow/compute/api_scalar.h | 4 ++- .../arrow/compute/kernels/scalar_validity.cc | 29 ++++++++++++++----- .../compute/kernels/scalar_validity_test.cc | 14 +++++++++ 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 335a255d785..24c1dff6b2b 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -284,8 +284,7 @@ DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start constexpr char DayOfWeekOptions::kTypeName[]; NanNullOptions::NanNullOptions(bool nan_is_null) - : FunctionOptions(internal::kNanNullOptionsType), - nan_is_null(nan_is_null) {} + : FunctionOptions(internal::kNanNullOptionsType), nan_is_null(nan_is_null) {} constexpr char NanNullOptions::kTypeName[]; namespace internal { diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 8dc4e995166..c9fb36d371f 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -757,7 +757,9 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, NanNullOptions options = NanNullOptions::Defaults(), ExecContext* ctx = NULLPTR); +Result IsNull(const Datum& values, + NanNullOptions options = NanNullOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, /// false otherwise diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index b73cafc1dbb..fa61074468a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -77,9 +77,18 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + bool is_nan_value = false; + bool is_floating = false; + if (in.type == float32()) { + is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); + is_floating = true; + } else if (in.type == float64()) { + is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); + is_floating = true; + } + auto options = OptionsWrapper::Get(ctx); - bool is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); - if (in.is_valid && is_floating(in.type->id()) && options.nan_is_null && is_nan_value) { + if (in.is_valid && is_floating && options.nan_is_null && is_nan_value) { checked_cast(out)->value = true; return Status::OK(); } @@ -112,7 +121,9 @@ struct IsNanOperator { void MakeFunction(std::string name, const FunctionDoc* doc, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, - MemAllocation::type mem_allocation, bool can_write_into_slices, const FunctionOptions* default_options = NULLPTR, KernelInit init = NULLPTR) { + MemAllocation::type mem_allocation, bool can_write_into_slices, + const FunctionOptions* default_options = NULLPTR, + KernelInit init = NULLPTR) { Arity arity{static_cast(in_types.size())}; auto func = std::make_shared(name, arity, doc, default_options); @@ -210,9 +221,12 @@ const FunctionDoc is_inf_doc( ("For each input value, emit true iff the value is infinite (inf or -inf)."), {"values"}); -const FunctionDoc is_null_doc("Return true if null, NaN can be considered as null", - ("For each input value, emit true iff the value is null. Default behavior is to emit false for NaN values. True can be emitted for NaN values by toggling NanNullOptions flag."), - {"values"}, "NanNullOptions"); +const FunctionDoc is_null_doc( + "Return true if null, NaN can be considered as null", + ("For each input value, emit true iff the value is null. Default behavior is to emit " + "false for NaN values. True can be emitted for NaN values by toggling " + "NanNullOptions flag."), + {"values"}, "NanNullOptions"); const FunctionDoc is_nan_doc("Return true if NaN", ("For each input value, emit true iff the value is NaN."), @@ -228,7 +242,8 @@ void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec, registry, MemAllocation::PREALLOCATE, - /*can_write_into_slices=*/true, &kNanNullOptions, OptionsWrapper::Init); + /*can_write_into_slices=*/true, &kNanNullOptions, + OptionsWrapper::Init); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc))); DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc))); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 1a7a1cbda15..e4b5698383b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -75,6 +75,20 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), "[true, false, false, true]"); + + // TODO: This tests could be helpful (scalar for now works well, but not ArrayData still + // not) By default 'NaN' value is not considered as null + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[true, false, false]")); + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[true, false, false]")); + + // Setting 'nan_is_null' as true, 'NaN value will be considered as null + const NanNullOptions& options = NanNullOptions(true); + // CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), + // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only + // CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), + // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only } TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { From b6716e0dc711ecf0ab83ef1d17c583b77128745b Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 23:20:51 -0500 Subject: [PATCH 26/66] Improve message for is_null tests --- cpp/src/arrow/compute/kernels/scalar_validity_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index e4b5698383b..1988cd5a200 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -76,8 +76,7 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), "[true, false, false, true]"); - // TODO: This tests could be helpful (scalar for now works well, but not ArrayData still - // not) By default 'NaN' value is not considered as null + // By default 'NaN' value is not considered as null CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), ArrayFromJSON(boolean(), "[true, false, false]")); CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), @@ -85,6 +84,7 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { // Setting 'nan_is_null' as true, 'NaN value will be considered as null const NanNullOptions& options = NanNullOptions(true); + // TODO: These tests could be helpful (Scalar for now works, but not ArrayData) // CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only // CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), From 7e02c372ac07e6b4e1b34792cba978969346b873 Mon Sep 17 00:00:00 2001 From: christian Date: Mon, 9 Aug 2021 16:26:55 -0500 Subject: [PATCH 27/66] Apply requested changes for IsNullOperator --- .../arrow/compute/kernels/scalar_validity.cc | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index fa61074468a..2cac6abeb8c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,6 +32,8 @@ namespace compute { namespace internal { namespace { +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -77,23 +79,26 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - bool is_nan_value = false; - bool is_floating = false; - if (in.type == float32()) { - is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); - is_floating = true; - } else if (in.type == float64()) { - is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); - is_floating = true; - } - auto options = OptionsWrapper::Get(ctx); - if (in.is_valid && is_floating && options.nan_is_null && is_nan_value) { - checked_cast(out)->value = true; - return Status::OK(); + bool* out_value = &checked_cast(out)->value; + if (in.is_valid) { + switch (in.type->id()) { + case Type::FLOAT: + *out_value = options.nan_is_null && + std::isnan(internal::UnboxScalar::Unbox(in)); + break; + case Type::DOUBLE: + *out_value = options.nan_is_null && + std::isnan(internal::UnboxScalar::Unbox(in)); + break; + default: + *out_value = false; + break; + } + } else { + *out_value = true; } - checked_cast(out)->value = !in.is_valid; return Status::OK(); } @@ -234,8 +239,6 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); From 083d4a6269c06011d48d4a6b3060c0e284e74055 Mon Sep 17 00:00:00 2001 From: christian Date: Mon, 9 Aug 2021 16:48:39 -0500 Subject: [PATCH 28/66] Remove default break, add todo to handle ArrayData for is_null --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 2cac6abeb8c..c11c5ef8e56 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -93,7 +93,6 @@ struct IsNullOperator { break; default: *out_value = false; - break; } } else { *out_value = true; @@ -103,6 +102,9 @@ struct IsNullOperator { } static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { + // TODO: Is `options` needed for detect nulls? Which is the better way to + // handle is_null for ArrayData + auto options = OptionsWrapper::Get(ctx); if (arr.MayHaveNulls()) { // Input has nulls => output is the inverted null (validity) bitmap. InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, From 102921f04d3f76362ad4a63153d6a479f67ce6c6 Mon Sep 17 00:00:00 2001 From: christian Date: Tue, 10 Aug 2021 15:50:01 -0500 Subject: [PATCH 29/66] Apply SetBitsTo for NaN values when passed NanNullOptions --- .../arrow/compute/kernels/scalar_validity.cc | 26 +++++++++++++++---- .../compute/kernels/scalar_validity_test.cc | 12 ++++----- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index c11c5ef8e56..9c3d479fff5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,8 +32,6 @@ namespace compute { namespace internal { namespace { -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -102,9 +100,6 @@ struct IsNullOperator { } static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { - // TODO: Is `options` needed for detect nulls? Which is the better way to - // handle is_null for ArrayData - auto options = OptionsWrapper::Get(ctx); if (arr.MayHaveNulls()) { // Input has nulls => output is the inverted null (validity) bitmap. InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, @@ -114,6 +109,25 @@ struct IsNullOperator { BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, false); } + + auto options = OptionsWrapper::Get(ctx); + if (options.nan_is_null) { + if (arr.type->id() == Type::FLOAT) { + const float* data = arr.GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + if (std::isnan(data[i])) + BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), i, 1, true); + } + + } else if (arr.type->id() == Type::DOUBLE) { + const double* data = arr.GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + if (std::isnan(data[i])) + BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), i, 1, true); + } + } + } + return Status::OK(); } }; @@ -241,6 +255,8 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 1988cd5a200..c248f241719 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -82,13 +82,13 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), ArrayFromJSON(boolean(), "[true, false, false]")); - // Setting 'nan_is_null' as true, 'NaN value will be considered as null + // Setting 'nan_is_null' as true, 'NaN value could be considered as null (floating + // points) const NanNullOptions& options = NanNullOptions(true); - // TODO: These tests could be helpful (Scalar for now works, but not ArrayData) - // CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), - // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only - // CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), - // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[5.0, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[false, false, true]"), &options); + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[true, false, true]"), &options); } TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { From a9e3229c8a2161095ce6e1fde67c2b64d92c723c Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 09:40:52 -0500 Subject: [PATCH 30/66] move kNanNullOptions to anonymous namespace --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 9c3d479fff5..f98d4ce85e8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,6 +32,8 @@ namespace compute { namespace internal { namespace { +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -255,8 +257,6 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); From 433a8c4e247205675d5659172830ac31c61cb0c8 Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 12:14:49 -0500 Subject: [PATCH 31/66] Fix for arrow-compute-expression-test (is_null) --- cpp/src/arrow/compute/exec/expression.cc | 5 ++++- cpp/src/arrow/compute/exec/expression.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc index 67a9f3c40ff..34d21d6c87f 100644 --- a/cpp/src/arrow/compute/exec/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -1154,7 +1154,10 @@ Expression greater_equal(Expression lhs, Expression rhs) { return call("greater_equal", {std::move(lhs), std::move(rhs)}); } -Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); } +Expression is_null(Expression lhs, bool nan_is_null) { + return call("is_null", {std::move(lhs)}, + compute::NanNullOptions(std::move(nan_is_null))); +} Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); } diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h index 3810accf70a..dac5728ab46 100644 --- a/cpp/src/arrow/compute/exec/expression.h +++ b/cpp/src/arrow/compute/exec/expression.h @@ -255,7 +255,7 @@ ARROW_EXPORT Expression greater(Expression lhs, Expression rhs); ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs); -ARROW_EXPORT Expression is_null(Expression lhs); +ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false); ARROW_EXPORT Expression is_valid(Expression lhs); From 71cfbfe60999bc9947c53fd6801911653f6760d2 Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 15:34:33 -0500 Subject: [PATCH 32/66] Add bindings in cython --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 4 ++-- python/pyarrow/_compute.pyx | 6 ++++++ python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 5 +++++ python/pyarrow/tests/test_compute.py | 1 + 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index f98d4ce85e8..9c3d479fff5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,8 +32,6 @@ namespace compute { namespace internal { namespace { -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -257,6 +255,8 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 2d82928f377..516e0cddf44 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -796,12 +796,18 @@ cdef class _SliceOptions(FunctionOptions): def _set_options(self, int64_t start, int64_t stop, int64_t step): self.wrapped.reset(new CSliceOptions(start, stop, step)) +cdef class _NanNullOptions(FunctionOptions): + def _set_options(self, bool nan_is_null): + self.wrapped.reset(new CNanNullOptions(nan_is_null)) class SliceOptions(_SliceOptions): def __init__(self, int64_t start, int64_t stop=sys.maxsize, int64_t step=1): self._set_options(start, stop, step) +class NanNullOptions(_NanNullOptions): + def __init__(self, bool nan_is_null): + self._set_options(nan_is_null) cdef class _FilterOptions(FunctionOptions): def _set_options(self, null_selection_behavior): diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 10880c2974c..63ff6d92a3b 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -49,6 +49,7 @@ ScalarAggregateOptions, SetLookupOptions, SliceOptions, + NanNullOptions, SortOptions, SplitOptions, SplitPatternOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9260cd28f85..1566c289a13 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1863,6 +1863,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: int64_t start int64_t stop int64_t step + + cdef cppclass CNanNullOptions \ + "arrow::compute::NanNullOptions"(CFunctionOptions): + CNanNullOptions(bool nan_is_null) + bool nan_is_null cdef cppclass CSplitOptions \ "arrow::compute::SplitOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 26d1f0e647c..edaa1a081e2 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -126,6 +126,7 @@ def test_option_class_equality(): pc.ReplaceSubstringOptions("a", "b"), pc.SetLookupOptions(value_set=pa.array([1])), pc.SliceOptions(start=0, stop=1, step=1), + pc.NanNullOptions(nan_is_null=False), pc.SplitPatternOptions(pattern="pattern"), pc.StrptimeOptions("%Y", "s"), pc.TrimOptions(" "), From aef53d0604858f104095c403a5c618e1bdd5adb5 Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 19:48:59 -0500 Subject: [PATCH 33/66] Add specialized tests for is_null --- cpp/src/arrow/compute/api_scalar.h | 2 +- .../arrow/compute/kernels/scalar_validity.cc | 8 +- .../compute/kernels/scalar_validity_test.cc | 101 +++++++++++++++--- 3 files changed, 90 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index c9fb36d371f..46dba2514dc 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -215,7 +215,7 @@ class ARROW_EXPORT NanNullOptions : public FunctionOptions { constexpr static char const kTypeName[] = "NanNullOptions"; static NanNullOptions Defaults() { return NanNullOptions{}; } - bool nan_is_null; + bool nan_is_null = false; }; enum CompareOperator : int8_t { diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 9c3d479fff5..09a468fcd3b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,6 +32,8 @@ namespace compute { namespace internal { namespace { +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -77,7 +79,7 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - auto options = OptionsWrapper::Get(ctx); + const NanNullOptions& options = OptionsWrapper::Get(ctx); bool* out_value = &checked_cast(out)->value; if (in.is_valid) { switch (in.type->id()) { @@ -110,7 +112,7 @@ struct IsNullOperator { false); } - auto options = OptionsWrapper::Get(ctx); + const NanNullOptions& options = OptionsWrapper::Get(ctx); if (options.nan_is_null) { if (arr.type->id() == Type::FLOAT) { const float* data = arr.GetValues(1); @@ -255,8 +257,6 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index c248f241719..c23ce987b79 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -75,20 +75,94 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), "[true, false, false, true]"); +} - // By default 'NaN' value is not considered as null - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[true, false, false]")); - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[true, false, false]")); +TEST_F(TestFloatValidityKernels, FloatArrayIsNull) { + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false]")); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); +} - // Setting 'nan_is_null' as true, 'NaN value could be considered as null (floating - // points) +TEST_F(TestFloatValidityKernels, FloatArrayIsNullUsingNanNullOptions) { const NanNullOptions& options = NanNullOptions(true); - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[5.0, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[false, false, true]"), &options); - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[true, false, true]"), &options); + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &options); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"), + &options); +} + +TEST_F(TestDoubleValidityKernels, DoubleArrayIsNull) { + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false]")); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); +} + +TEST_F(TestDoubleValidityKernels, DoubleArrayIsNullUsingNanNullOptions) { + const NanNullOptions& options = NanNullOptions(true); + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &options); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"), + &options); +} + +TEST_F(TestFloatValidityKernels, FloatScalarIsNull) { + CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false)); + CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true)); +} + +TEST_F(TestFloatValidityKernels, FloatScalarIsNullUsingNanNullOptions) { + const NanNullOptions& options = NanNullOptions(true); + CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(true), &options); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true), &options); +} + +TEST_F(TestDoubleValidityKernels, DoubleScalarIsNull) { + CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false)); + CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); +} + +TEST_F(TestDoubleValidityKernels, DoubleScalarIsNullUsingNanNullOptions) { + const NanNullOptions& options = NanNullOptions(true); + CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(true), &options); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true), &options); } TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { @@ -97,11 +171,6 @@ TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { ASSERT_EQ(result->null_count, 0); } -TEST_F(TestBooleanValidityKernels, ScalarIsNull) { - CheckScalarUnary("is_null", MakeScalar(19.7), MakeScalar(false)); - CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); -} - TEST_F(TestFloatValidityKernels, FloatArrayIsFinite) { // All Inf CheckScalarUnary("is_finite", ArrayFromJSON(float32(), "[Inf, -Inf, Inf, -Inf, Inf]"), From 57a7d3ef217c77de04c6f137fdff47d7f82fc1d7 Mon Sep 17 00:00:00 2001 From: christian Date: Thu, 12 Aug 2021 00:15:10 -0500 Subject: [PATCH 34/66] Fix Sanitizer for KNanNullOptions --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 09a468fcd3b..59f5cdb00a5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,8 +32,6 @@ namespace compute { namespace internal { namespace { -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -258,6 +256,7 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace void RegisterScalarValidity(FunctionRegistry* registry) { + static auto kNanNullOptions = NanNullOptions::Defaults(); MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); From b8582c7348d8d95dfcdae5207651a95f54d00e9a Mon Sep 17 00:00:00 2001 From: christian Date: Thu, 12 Aug 2021 00:25:02 -0500 Subject: [PATCH 35/66] Remove cython bindigs --- python/pyarrow/_compute.pyx | 8 -------- python/pyarrow/compute.py | 1 - python/pyarrow/includes/libarrow.pxd | 5 ----- python/pyarrow/tests/test_compute.py | 1 - 4 files changed, 15 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 516e0cddf44..aa6e5cfde94 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -796,19 +796,11 @@ cdef class _SliceOptions(FunctionOptions): def _set_options(self, int64_t start, int64_t stop, int64_t step): self.wrapped.reset(new CSliceOptions(start, stop, step)) -cdef class _NanNullOptions(FunctionOptions): - def _set_options(self, bool nan_is_null): - self.wrapped.reset(new CNanNullOptions(nan_is_null)) - class SliceOptions(_SliceOptions): def __init__(self, int64_t start, int64_t stop=sys.maxsize, int64_t step=1): self._set_options(start, stop, step) -class NanNullOptions(_NanNullOptions): - def __init__(self, bool nan_is_null): - self._set_options(nan_is_null) - cdef class _FilterOptions(FunctionOptions): def _set_options(self, null_selection_behavior): if null_selection_behavior == 'drop': diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 63ff6d92a3b..10880c2974c 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -49,7 +49,6 @@ ScalarAggregateOptions, SetLookupOptions, SliceOptions, - NanNullOptions, SortOptions, SplitOptions, SplitPatternOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 1566c289a13..9260cd28f85 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1863,11 +1863,6 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: int64_t start int64_t stop int64_t step - - cdef cppclass CNanNullOptions \ - "arrow::compute::NanNullOptions"(CFunctionOptions): - CNanNullOptions(bool nan_is_null) - bool nan_is_null cdef cppclass CSplitOptions \ "arrow::compute::SplitOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index edaa1a081e2..26d1f0e647c 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -126,7 +126,6 @@ def test_option_class_equality(): pc.ReplaceSubstringOptions("a", "b"), pc.SetLookupOptions(value_set=pa.array([1])), pc.SliceOptions(start=0, stop=1, step=1), - pc.NanNullOptions(nan_is_null=False), pc.SplitPatternOptions(pattern="pattern"), pc.StrptimeOptions("%Y", "s"), pc.TrimOptions(" "), From 4197bcc70e0bf23338687af21a7ab8a1e58c4683 Mon Sep 17 00:00:00 2001 From: christian Date: Thu, 12 Aug 2021 17:47:03 -0500 Subject: [PATCH 36/66] binding in cython layer, fix python tests (theorically) --- python/pyarrow/_compute.pyx | 14 ++++++++++++++ python/pyarrow/_dataset.pyx | 8 ++++++-- python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 5 +++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index aa6e5cfde94..aeadaaac6a6 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -796,11 +796,13 @@ cdef class _SliceOptions(FunctionOptions): def _set_options(self, int64_t start, int64_t stop, int64_t step): self.wrapped.reset(new CSliceOptions(start, stop, step)) + class SliceOptions(_SliceOptions): def __init__(self, int64_t start, int64_t stop=sys.maxsize, int64_t step=1): self._set_options(start, stop, step) + cdef class _FilterOptions(FunctionOptions): def _set_options(self, null_selection_behavior): if null_selection_behavior == 'drop': @@ -991,6 +993,18 @@ class DayOfWeekOptions(_DayOfWeekOptions): self._set_options(one_based_numbering, week_start) +cdef class _NanNullOptions(FunctionOptions): + def _set_options(self, nan_is_null): + self.wrapped.reset( + new CNanNullOptions(nan_is_null) + ) + + +class NanNullOptions(_NanNullOptions): + def __init__(self, nan_is_null=False): + self._set_options(nan_is_null) + + cdef class _VarianceOptions(FunctionOptions): def _set_options(self, ddof): self.wrapped.reset(new CVarianceOptions(ddof)) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 945475bd7f1..f2aecc7d60d 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -234,9 +234,13 @@ cdef class Expression(_Weakrefable): """Checks whether the expression is not-null (valid)""" return Expression._call("is_valid", [self]) - def is_null(self): + def is_null(self, bint nan_is_null=False): """Checks whether the expression is null""" - return Expression._call("is_null", [self]) + cdef: + shared_ptr[CFunctionOptions] c_options + + c_options.reset(new CNanNullOptions(nan_is_null)) + return Expression._call("is_null", [self], c_options) def cast(self, type, bint safe=True): """Explicitly change the expression's data type""" diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 10880c2974c..4efa9fbfc41 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -54,6 +54,7 @@ SplitPatternOptions, StrptimeOptions, DayOfWeekOptions, + NanNullOptions, TakeOptions, TDigestOptions, TrimOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9260cd28f85..66cf7e9a460 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1957,6 +1957,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: c_bool one_based_numbering uint32_t week_start + cdef cppclass CNanNullOptions \ + "arrow::compute::NanNullOptions"(CFunctionOptions): + CNanNullOptions(c_bool nan_is_null) + c_bool nan_is_null + cdef cppclass CVarianceOptions \ "arrow::compute::VarianceOptions"(CFunctionOptions): CVarianceOptions(int ddof) From 42943b90bb9ad4b35c4f263af041d6824b6a0635 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 17 Aug 2021 17:05:57 -0400 Subject: [PATCH 37/66] reworked kernel implementation --- .../arrow/compute/kernels/scalar_validity.cc | 68 ++++++++++++------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 59f5cdb00a5..91706189d82 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,6 +32,9 @@ namespace compute { namespace internal { namespace { +template +using enable_if_floating_point = enable_if_t::value, R>; + struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -77,20 +80,24 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - const NanNullOptions& options = OptionsWrapper::Get(ctx); + const auto options = OptionsWrapper::Get(ctx); bool* out_value = &checked_cast(out)->value; if (in.is_valid) { - switch (in.type->id()) { - case Type::FLOAT: - *out_value = options.nan_is_null && - std::isnan(internal::UnboxScalar::Unbox(in)); - break; - case Type::DOUBLE: - *out_value = options.nan_is_null && - std::isnan(internal::UnboxScalar::Unbox(in)); - break; - default: - *out_value = false; + if (is_floating(in.type->id())) { + switch (in.type->id()) { + case Type::FLOAT: + *out_value = options.nan_is_null && + std::isnan(internal::UnboxScalar::Unbox(in)); + break; + case Type::DOUBLE: + *out_value = options.nan_is_null && + std::isnan(internal::UnboxScalar::Unbox(in)); + break; + default: + return Status::NotImplemented("Type not implemented"); + } + } else { + *out_value = false; } } else { *out_value = true; @@ -99,6 +106,17 @@ struct IsNullOperator { return Status::OK(); } + template + static enable_if_floating_point SetNanBits(const ArrayData& arr, + ArrayData* out) { + const T* data = arr.GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + if (std::isnan(data[i])) { + BitUtil::SetBit(out->buffers[1]->mutable_data(), i); + } + } + } + static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { if (arr.MayHaveNulls()) { // Input has nulls => output is the inverted null (validity) bitmap. @@ -110,20 +128,18 @@ struct IsNullOperator { false); } - const NanNullOptions& options = OptionsWrapper::Get(ctx); - if (options.nan_is_null) { - if (arr.type->id() == Type::FLOAT) { - const float* data = arr.GetValues(1); - for (int64_t i = 0; i < arr.length; ++i) { - if (std::isnan(data[i])) - BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), i, 1, true); - } - - } else if (arr.type->id() == Type::DOUBLE) { - const double* data = arr.GetValues(1); - for (int64_t i = 0; i < arr.length; ++i) { - if (std::isnan(data[i])) - BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), i, 1, true); + if (is_floating(arr.type->id())) { + const auto options = OptionsWrapper::Get(ctx); + if (options.nan_is_null) { + switch (arr.type->id()) { + case Type::FLOAT: + SetNanBits(arr, out); + break; + case Type::DOUBLE: + SetNanBits(arr, out); + break; + default: + return Status::NotImplemented("Type not implemented"); } } } From 14bdd007f4036e6178c10d7dc4ce7639eb444f92 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 11:23:24 -0500 Subject: [PATCH 38/66] Add NanNullOptions for IsNull kernel --- cpp/src/arrow/compute/api_scalar.cc | 5 ++++- cpp/src/arrow/compute/api_scalar.h | 12 +++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index f9c95a7745b..7439cefe003 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -452,7 +452,6 @@ Result Compare(const Datum& left, const Datum& right, CompareOptions opti // Validity functions SCALAR_EAGER_UNARY(IsValid, "is_valid") -SCALAR_EAGER_UNARY(IsNull, "is_null") SCALAR_EAGER_UNARY(IsNan, "is_nan") Result FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx) { @@ -472,6 +471,10 @@ Result CaseWhen(const Datum& cond, const std::vector& cases, return CallFunction("case_when", args, ctx); } +Result IsNull(const Datum& arg, const NanNullOptions& options, ExecContext* ctx) { + return CallFunction("is_null", {arg}, &options, ctx); +} + // ---------------------------------------------------------------------- // Temporal functions diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 115cad13d6f..fe3648f473e 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -209,6 +209,15 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { int64_t start, stop, step; }; +class ARROW_EXPORT NanNullOptions : public FunctionOptions { + public: + explicit NanNullOptions(bool nan_is_null); + NanNullOptions(); + constexpr static char const kTypeName[] = "NanNullOptions"; + + bool nan_is_null; +}; + enum CompareOperator : int8_t { EQUAL, NOT_EQUAL, @@ -741,13 +750,14 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// false otherwise /// /// \param[in] values input to examine for nullity +/// \param[in] options NanNullOptions /// \param[in] ctx the function execution context, optional /// \return the resulting datum /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, ExecContext* ctx = NULLPTR); +Result IsNull(const Datum& values, const NanNullOptions& options, ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, /// false otherwise From 43219a1863eb97ffea687e306c7ba7b378c44c32 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 11:31:37 -0500 Subject: [PATCH 39/66] Add kNanNullOptionsType to RegisterScalarOptions --- cpp/src/arrow/compute/api_scalar.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 7439cefe003..25818cd27eb 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -159,6 +159,8 @@ static auto kMakeStructOptionsType = GetFunctionOptionsType( static auto kDayOfWeekOptionsType = GetFunctionOptionsType( DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering), DataMember("week_start", &DayOfWeekOptions::week_start)); +static auto kNanNullOptionsType = GetFunctionOptionsType( + DataMember("nan_is_null", &NanNullOptions::nan_is_null)); } // namespace } // namespace internal @@ -281,6 +283,12 @@ DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start week_start(week_start) {} constexpr char DayOfWeekOptions::kTypeName[]; +NanNullOptions::NanNullOptions(bool nan_is_null) + : FunctionOptions(internal::kNanNullOptionsType), + nan_is_null(nan_is_null) {} +NanNullOptions::NanNullOptions() : NanNullOptions(false) {} +constexpr char NanNullOptions::kTypeName[]; + namespace internal { void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType)); @@ -299,6 +307,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kNanNullOptionsType)); } } // namespace internal From 98ead778e6cdb6d4d8577ffa3d5cb7a370e0b33b Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 12:18:58 -0500 Subject: [PATCH 40/66] Add Init param for MafeFunction and docs for is_null kernel --- .../arrow/compute/kernels/scalar_validity.cc | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index ead88abc0f2..fd60d012a70 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" #include "arrow/util/bit_util.h" @@ -76,6 +77,15 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + bool nan_is_null = false; + if (in.is_valid && in.ToString() == "nan") { + if (in.type == float16() || in.type == float32() || in.type == float64()) { + nan_is_null = true; + checked_cast(out)->value = nan_is_null; + return Status::OK(); + } + } + checked_cast(out)->value = !in.is_valid; return Status::OK(); } @@ -104,11 +114,11 @@ struct IsNanOperator { void MakeFunction(std::string name, const FunctionDoc* doc, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, - MemAllocation::type mem_allocation, bool can_write_into_slices) { + MemAllocation::type mem_allocation, bool can_write_into_slices, const FunctionOptions* default_options = NULLPTR, KernelInit init = NULLPTR) { Arity arity{static_cast(in_types.size())}; - auto func = std::make_shared(name, arity, doc); + auto func = std::make_shared(name, arity, doc, default_options); - ScalarKernel kernel(std::move(in_types), out_type, exec); + ScalarKernel kernel(std::move(in_types), out_type, exec, init); kernel.null_handling = NullHandling::OUTPUT_NOT_NULL; kernel.can_write_into_slices = can_write_into_slices; kernel.mem_allocation = mem_allocation; @@ -202,9 +212,9 @@ const FunctionDoc is_inf_doc( ("For each input value, emit true iff the value is infinite (inf or -inf)."), {"values"}); -const FunctionDoc is_null_doc("Return true if null", - ("For each input value, emit true iff the value is null."), - {"values"}); +const FunctionDoc is_null_doc("Return true if null, NaN can be considered as null", + ("For each input value, emit true iff the value is null. Default behavior is to emit false for NaN values. True can be emitted for NaN values by toggling NanNullOptions flag."), + {"values"}, "NanNullOptions"); const FunctionDoc is_nan_doc("Return true if NaN", ("For each input value, emit true iff the value is NaN."), @@ -212,13 +222,15 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec, registry, MemAllocation::PREALLOCATE, - /*can_write_into_slices=*/true); + /*can_write_into_slices=*/true, &kNanNullOptions, OptionsWrapper::Init); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc))); DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc))); From f8514a84d8145ffc32eaf91731398ae5525e52df Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 12:24:50 -0500 Subject: [PATCH 41/66] Add defaults() method to NanNullOptions class --- cpp/src/arrow/compute/api_scalar.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index fe3648f473e..fdec9d655c5 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -213,6 +213,7 @@ class ARROW_EXPORT NanNullOptions : public FunctionOptions { public: explicit NanNullOptions(bool nan_is_null); NanNullOptions(); + static NanNullOptions Defaults() { return NanNullOptions{}; } constexpr static char const kTypeName[] = "NanNullOptions"; bool nan_is_null; From e5a950d456f7ff32b75d04026f81836c6d98b34b Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 13:33:24 -0500 Subject: [PATCH 42/66] Add implementation for IsNullOperator, Scalar case --- cpp/src/arrow/compute/api_scalar.cc | 1 - cpp/src/arrow/compute/api_scalar.h | 5 ++--- cpp/src/arrow/compute/kernels/scalar_validity.cc | 12 +++++------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 25818cd27eb..1cff0b702a9 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -286,7 +286,6 @@ constexpr char DayOfWeekOptions::kTypeName[]; NanNullOptions::NanNullOptions(bool nan_is_null) : FunctionOptions(internal::kNanNullOptionsType), nan_is_null(nan_is_null) {} -NanNullOptions::NanNullOptions() : NanNullOptions(false) {} constexpr char NanNullOptions::kTypeName[]; namespace internal { diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index fdec9d655c5..d9ac5838f26 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -211,8 +211,7 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { class ARROW_EXPORT NanNullOptions : public FunctionOptions { public: - explicit NanNullOptions(bool nan_is_null); - NanNullOptions(); + explicit NanNullOptions(bool nan_is_null = false); static NanNullOptions Defaults() { return NanNullOptions{}; } constexpr static char const kTypeName[] = "NanNullOptions"; @@ -758,7 +757,7 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, const NanNullOptions& options, ExecContext* ctx = NULLPTR); +Result IsNull(const Datum& values, NanNullOptions options = NanNullOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, /// false otherwise diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index fd60d012a70..b73cafc1dbb 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -77,13 +77,11 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - bool nan_is_null = false; - if (in.is_valid && in.ToString() == "nan") { - if (in.type == float16() || in.type == float32() || in.type == float64()) { - nan_is_null = true; - checked_cast(out)->value = nan_is_null; - return Status::OK(); - } + auto options = OptionsWrapper::Get(ctx); + bool is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); + if (in.is_valid && is_floating(in.type->id()) && options.nan_is_null && is_nan_value) { + checked_cast(out)->value = true; + return Status::OK(); } checked_cast(out)->value = !in.is_valid; From b2414e9b6c844bd4bc57ef810cddcffe5fb606d6 Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 18:35:42 -0500 Subject: [PATCH 43/66] Fix test compilation issue --- cpp/src/arrow/compute/api_scalar.cc | 2 +- cpp/src/arrow/compute/api_scalar.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 1cff0b702a9..335a255d785 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -479,7 +479,7 @@ Result CaseWhen(const Datum& cond, const std::vector& cases, return CallFunction("case_when", args, ctx); } -Result IsNull(const Datum& arg, const NanNullOptions& options, ExecContext* ctx) { +Result IsNull(const Datum& arg, NanNullOptions options, ExecContext* ctx) { return CallFunction("is_null", {arg}, &options, ctx); } diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index d9ac5838f26..8dc4e995166 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -212,8 +212,8 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { class ARROW_EXPORT NanNullOptions : public FunctionOptions { public: explicit NanNullOptions(bool nan_is_null = false); - static NanNullOptions Defaults() { return NanNullOptions{}; } constexpr static char const kTypeName[] = "NanNullOptions"; + static NanNullOptions Defaults() { return NanNullOptions{}; } bool nan_is_null; }; From d3eb679b141bc692d30274c355cd42679f84696a Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 23:15:06 -0500 Subject: [PATCH 44/66] Apply clang format, add tests for isnull --- cpp/src/arrow/compute/api_scalar.cc | 3 +- cpp/src/arrow/compute/api_scalar.h | 4 ++- .../arrow/compute/kernels/scalar_validity.cc | 29 ++++++++++++++----- .../compute/kernels/scalar_validity_test.cc | 14 +++++++++ 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 335a255d785..24c1dff6b2b 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -284,8 +284,7 @@ DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start constexpr char DayOfWeekOptions::kTypeName[]; NanNullOptions::NanNullOptions(bool nan_is_null) - : FunctionOptions(internal::kNanNullOptionsType), - nan_is_null(nan_is_null) {} + : FunctionOptions(internal::kNanNullOptionsType), nan_is_null(nan_is_null) {} constexpr char NanNullOptions::kTypeName[]; namespace internal { diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 8dc4e995166..c9fb36d371f 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -757,7 +757,9 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, NanNullOptions options = NanNullOptions::Defaults(), ExecContext* ctx = NULLPTR); +Result IsNull(const Datum& values, + NanNullOptions options = NanNullOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, /// false otherwise diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index b73cafc1dbb..fa61074468a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -77,9 +77,18 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { + bool is_nan_value = false; + bool is_floating = false; + if (in.type == float32()) { + is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); + is_floating = true; + } else if (in.type == float64()) { + is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); + is_floating = true; + } + auto options = OptionsWrapper::Get(ctx); - bool is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); - if (in.is_valid && is_floating(in.type->id()) && options.nan_is_null && is_nan_value) { + if (in.is_valid && is_floating && options.nan_is_null && is_nan_value) { checked_cast(out)->value = true; return Status::OK(); } @@ -112,7 +121,9 @@ struct IsNanOperator { void MakeFunction(std::string name, const FunctionDoc* doc, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, - MemAllocation::type mem_allocation, bool can_write_into_slices, const FunctionOptions* default_options = NULLPTR, KernelInit init = NULLPTR) { + MemAllocation::type mem_allocation, bool can_write_into_slices, + const FunctionOptions* default_options = NULLPTR, + KernelInit init = NULLPTR) { Arity arity{static_cast(in_types.size())}; auto func = std::make_shared(name, arity, doc, default_options); @@ -210,9 +221,12 @@ const FunctionDoc is_inf_doc( ("For each input value, emit true iff the value is infinite (inf or -inf)."), {"values"}); -const FunctionDoc is_null_doc("Return true if null, NaN can be considered as null", - ("For each input value, emit true iff the value is null. Default behavior is to emit false for NaN values. True can be emitted for NaN values by toggling NanNullOptions flag."), - {"values"}, "NanNullOptions"); +const FunctionDoc is_null_doc( + "Return true if null, NaN can be considered as null", + ("For each input value, emit true iff the value is null. Default behavior is to emit " + "false for NaN values. True can be emitted for NaN values by toggling " + "NanNullOptions flag."), + {"values"}, "NanNullOptions"); const FunctionDoc is_nan_doc("Return true if NaN", ("For each input value, emit true iff the value is NaN."), @@ -228,7 +242,8 @@ void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec, registry, MemAllocation::PREALLOCATE, - /*can_write_into_slices=*/true, &kNanNullOptions, OptionsWrapper::Init); + /*can_write_into_slices=*/true, &kNanNullOptions, + OptionsWrapper::Init); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc))); DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc))); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 1a7a1cbda15..e4b5698383b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -75,6 +75,20 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), "[true, false, false, true]"); + + // TODO: This tests could be helpful (scalar for now works well, but not ArrayData still + // not) By default 'NaN' value is not considered as null + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[true, false, false]")); + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[true, false, false]")); + + // Setting 'nan_is_null' as true, 'NaN value will be considered as null + const NanNullOptions& options = NanNullOptions(true); + // CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), + // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only + // CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), + // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only } TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { From 12b7c9e008df08c05a8789fcfac3dbb7c48fae1d Mon Sep 17 00:00:00 2001 From: christian Date: Fri, 6 Aug 2021 23:20:51 -0500 Subject: [PATCH 45/66] Improve message for is_null tests --- cpp/src/arrow/compute/kernels/scalar_validity_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index e4b5698383b..1988cd5a200 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -76,8 +76,7 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), "[true, false, false, true]"); - // TODO: This tests could be helpful (scalar for now works well, but not ArrayData still - // not) By default 'NaN' value is not considered as null + // By default 'NaN' value is not considered as null CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), ArrayFromJSON(boolean(), "[true, false, false]")); CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), @@ -85,6 +84,7 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { // Setting 'nan_is_null' as true, 'NaN value will be considered as null const NanNullOptions& options = NanNullOptions(true); + // TODO: These tests could be helpful (Scalar for now works, but not ArrayData) // CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only // CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), From 8f82b98b5a2ec9ecb2779fbbae929eff9f3ab3ad Mon Sep 17 00:00:00 2001 From: christian Date: Mon, 9 Aug 2021 16:26:55 -0500 Subject: [PATCH 46/66] Apply requested changes for IsNullOperator --- .../arrow/compute/kernels/scalar_validity.cc | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index fa61074468a..2cac6abeb8c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,6 +32,8 @@ namespace compute { namespace internal { namespace { +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -77,23 +79,26 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - bool is_nan_value = false; - bool is_floating = false; - if (in.type == float32()) { - is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); - is_floating = true; - } else if (in.type == float64()) { - is_nan_value = std::isnan(internal::UnboxScalar::Unbox(in)); - is_floating = true; - } - auto options = OptionsWrapper::Get(ctx); - if (in.is_valid && is_floating && options.nan_is_null && is_nan_value) { - checked_cast(out)->value = true; - return Status::OK(); + bool* out_value = &checked_cast(out)->value; + if (in.is_valid) { + switch (in.type->id()) { + case Type::FLOAT: + *out_value = options.nan_is_null && + std::isnan(internal::UnboxScalar::Unbox(in)); + break; + case Type::DOUBLE: + *out_value = options.nan_is_null && + std::isnan(internal::UnboxScalar::Unbox(in)); + break; + default: + *out_value = false; + break; + } + } else { + *out_value = true; } - checked_cast(out)->value = !in.is_valid; return Status::OK(); } @@ -234,8 +239,6 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); From f0010688d2b32540ac9bda4befebbb4e47191346 Mon Sep 17 00:00:00 2001 From: christian Date: Mon, 9 Aug 2021 16:48:39 -0500 Subject: [PATCH 47/66] Remove default break, add todo to handle ArrayData for is_null --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 2cac6abeb8c..c11c5ef8e56 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -93,7 +93,6 @@ struct IsNullOperator { break; default: *out_value = false; - break; } } else { *out_value = true; @@ -103,6 +102,9 @@ struct IsNullOperator { } static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { + // TODO: Is `options` needed for detect nulls? Which is the better way to + // handle is_null for ArrayData + auto options = OptionsWrapper::Get(ctx); if (arr.MayHaveNulls()) { // Input has nulls => output is the inverted null (validity) bitmap. InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, From c64a30ac864198c904176c800cfcac5339d9c9f2 Mon Sep 17 00:00:00 2001 From: christian Date: Tue, 10 Aug 2021 15:50:01 -0500 Subject: [PATCH 48/66] Apply SetBitsTo for NaN values when passed NanNullOptions --- .../arrow/compute/kernels/scalar_validity.cc | 26 +++++++++++++++---- .../compute/kernels/scalar_validity_test.cc | 12 ++++----- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index c11c5ef8e56..9c3d479fff5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,8 +32,6 @@ namespace compute { namespace internal { namespace { -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -102,9 +100,6 @@ struct IsNullOperator { } static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { - // TODO: Is `options` needed for detect nulls? Which is the better way to - // handle is_null for ArrayData - auto options = OptionsWrapper::Get(ctx); if (arr.MayHaveNulls()) { // Input has nulls => output is the inverted null (validity) bitmap. InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, @@ -114,6 +109,25 @@ struct IsNullOperator { BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, false); } + + auto options = OptionsWrapper::Get(ctx); + if (options.nan_is_null) { + if (arr.type->id() == Type::FLOAT) { + const float* data = arr.GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + if (std::isnan(data[i])) + BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), i, 1, true); + } + + } else if (arr.type->id() == Type::DOUBLE) { + const double* data = arr.GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + if (std::isnan(data[i])) + BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), i, 1, true); + } + } + } + return Status::OK(); } }; @@ -241,6 +255,8 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 1988cd5a200..c248f241719 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -82,13 +82,13 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), ArrayFromJSON(boolean(), "[true, false, false]")); - // Setting 'nan_is_null' as true, 'NaN value will be considered as null + // Setting 'nan_is_null' as true, 'NaN value could be considered as null (floating + // points) const NanNullOptions& options = NanNullOptions(true); - // TODO: These tests could be helpful (Scalar for now works, but not ArrayData) - // CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), - // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only - // CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), - // ArrayFromJSON(boolean(), "[true, false, true]"), &options); // works for scalar only + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[5.0, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[false, false, true]"), &options); + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), + ArrayFromJSON(boolean(), "[true, false, true]"), &options); } TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { From 2e3c8a6213df0d63be7056009706908ab4532a3d Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 09:40:52 -0500 Subject: [PATCH 49/66] move kNanNullOptions to anonymous namespace --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 9c3d479fff5..f98d4ce85e8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,6 +32,8 @@ namespace compute { namespace internal { namespace { +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -255,8 +257,6 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); From 5abfa2e48e9852c668b528c40b530dcf9e26eeed Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 12:14:49 -0500 Subject: [PATCH 50/66] Fix for arrow-compute-expression-test (is_null) --- cpp/src/arrow/compute/exec/expression.cc | 5 ++++- cpp/src/arrow/compute/exec/expression.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc index 67a9f3c40ff..34d21d6c87f 100644 --- a/cpp/src/arrow/compute/exec/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -1154,7 +1154,10 @@ Expression greater_equal(Expression lhs, Expression rhs) { return call("greater_equal", {std::move(lhs), std::move(rhs)}); } -Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); } +Expression is_null(Expression lhs, bool nan_is_null) { + return call("is_null", {std::move(lhs)}, + compute::NanNullOptions(std::move(nan_is_null))); +} Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); } diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h index 3810accf70a..dac5728ab46 100644 --- a/cpp/src/arrow/compute/exec/expression.h +++ b/cpp/src/arrow/compute/exec/expression.h @@ -255,7 +255,7 @@ ARROW_EXPORT Expression greater(Expression lhs, Expression rhs); ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs); -ARROW_EXPORT Expression is_null(Expression lhs); +ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false); ARROW_EXPORT Expression is_valid(Expression lhs); From 35df5d48ee72ac7939b90e3141c1bbe56d7ffe19 Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 15:34:33 -0500 Subject: [PATCH 51/66] Add bindings in cython --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 4 ++-- python/pyarrow/_compute.pyx | 6 ++++++ python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 5 +++++ python/pyarrow/tests/test_compute.py | 1 + 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index f98d4ce85e8..9c3d479fff5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,8 +32,6 @@ namespace compute { namespace internal { namespace { -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -257,6 +255,8 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 2d82928f377..516e0cddf44 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -796,12 +796,18 @@ cdef class _SliceOptions(FunctionOptions): def _set_options(self, int64_t start, int64_t stop, int64_t step): self.wrapped.reset(new CSliceOptions(start, stop, step)) +cdef class _NanNullOptions(FunctionOptions): + def _set_options(self, bool nan_is_null): + self.wrapped.reset(new CNanNullOptions(nan_is_null)) class SliceOptions(_SliceOptions): def __init__(self, int64_t start, int64_t stop=sys.maxsize, int64_t step=1): self._set_options(start, stop, step) +class NanNullOptions(_NanNullOptions): + def __init__(self, bool nan_is_null): + self._set_options(nan_is_null) cdef class _FilterOptions(FunctionOptions): def _set_options(self, null_selection_behavior): diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 10880c2974c..63ff6d92a3b 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -49,6 +49,7 @@ ScalarAggregateOptions, SetLookupOptions, SliceOptions, + NanNullOptions, SortOptions, SplitOptions, SplitPatternOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9260cd28f85..1566c289a13 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1863,6 +1863,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: int64_t start int64_t stop int64_t step + + cdef cppclass CNanNullOptions \ + "arrow::compute::NanNullOptions"(CFunctionOptions): + CNanNullOptions(bool nan_is_null) + bool nan_is_null cdef cppclass CSplitOptions \ "arrow::compute::SplitOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 26d1f0e647c..edaa1a081e2 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -126,6 +126,7 @@ def test_option_class_equality(): pc.ReplaceSubstringOptions("a", "b"), pc.SetLookupOptions(value_set=pa.array([1])), pc.SliceOptions(start=0, stop=1, step=1), + pc.NanNullOptions(nan_is_null=False), pc.SplitPatternOptions(pattern="pattern"), pc.StrptimeOptions("%Y", "s"), pc.TrimOptions(" "), From cfab892eeace25cfe2b2d6273687576e92889828 Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 11 Aug 2021 19:48:59 -0500 Subject: [PATCH 52/66] Add specialized tests for is_null --- cpp/src/arrow/compute/api_scalar.h | 2 +- .../arrow/compute/kernels/scalar_validity.cc | 8 +- .../compute/kernels/scalar_validity_test.cc | 101 +++++++++++++++--- 3 files changed, 90 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index c9fb36d371f..46dba2514dc 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -215,7 +215,7 @@ class ARROW_EXPORT NanNullOptions : public FunctionOptions { constexpr static char const kTypeName[] = "NanNullOptions"; static NanNullOptions Defaults() { return NanNullOptions{}; } - bool nan_is_null; + bool nan_is_null = false; }; enum CompareOperator : int8_t { diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 9c3d479fff5..09a468fcd3b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,6 +32,8 @@ namespace compute { namespace internal { namespace { +FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); + struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -77,7 +79,7 @@ struct IsInfOperator { struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - auto options = OptionsWrapper::Get(ctx); + const NanNullOptions& options = OptionsWrapper::Get(ctx); bool* out_value = &checked_cast(out)->value; if (in.is_valid) { switch (in.type->id()) { @@ -110,7 +112,7 @@ struct IsNullOperator { false); } - auto options = OptionsWrapper::Get(ctx); + const NanNullOptions& options = OptionsWrapper::Get(ctx); if (options.nan_is_null) { if (arr.type->id() == Type::FLOAT) { const float* data = arr.GetValues(1); @@ -255,8 +257,6 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index c248f241719..c23ce987b79 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -75,20 +75,94 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), "[true, false, false, true]"); +} - // By default 'NaN' value is not considered as null - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[null, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[true, false, false]")); - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[true, false, false]")); +TEST_F(TestFloatValidityKernels, FloatArrayIsNull) { + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false]")); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); +} - // Setting 'nan_is_null' as true, 'NaN value could be considered as null (floating - // points) +TEST_F(TestFloatValidityKernels, FloatArrayIsNullUsingNanNullOptions) { const NanNullOptions& options = NanNullOptions(true); - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[5.0, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[false, false, true]"), &options); - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[null, 2.0, NaN]"), - ArrayFromJSON(boolean(), "[true, false, true]"), &options); + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &options); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"), + &options); +} + +TEST_F(TestDoubleValidityKernels, DoubleArrayIsNull) { + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false]")); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); +} + +TEST_F(TestDoubleValidityKernels, DoubleArrayIsNullUsingNanNullOptions) { + const NanNullOptions& options = NanNullOptions(true); + // All NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options); + // No NaN + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &options); + // Some NaNs + CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"), + &options); +} + +TEST_F(TestFloatValidityKernels, FloatScalarIsNull) { + CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false)); + CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true)); +} + +TEST_F(TestFloatValidityKernels, FloatScalarIsNullUsingNanNullOptions) { + const NanNullOptions& options = NanNullOptions(true); + CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(true), &options); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true), &options); +} + +TEST_F(TestDoubleValidityKernels, DoubleScalarIsNull) { + CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(false)); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false)); + CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); +} + +TEST_F(TestDoubleValidityKernels, DoubleScalarIsNullUsingNanNullOptions) { + const NanNullOptions& options = NanNullOptions(true); + CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(true), &options); + CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), + MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true), &options); } TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { @@ -97,11 +171,6 @@ TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { ASSERT_EQ(result->null_count, 0); } -TEST_F(TestBooleanValidityKernels, ScalarIsNull) { - CheckScalarUnary("is_null", MakeScalar(19.7), MakeScalar(false)); - CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); -} - TEST_F(TestFloatValidityKernels, FloatArrayIsFinite) { // All Inf CheckScalarUnary("is_finite", ArrayFromJSON(float32(), "[Inf, -Inf, Inf, -Inf, Inf]"), From e92318dd663741e4a8da2e5dd9624936863377ea Mon Sep 17 00:00:00 2001 From: christian Date: Thu, 12 Aug 2021 00:15:10 -0500 Subject: [PATCH 53/66] Fix Sanitizer for KNanNullOptions --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 09a468fcd3b..59f5cdb00a5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,8 +32,6 @@ namespace compute { namespace internal { namespace { -FunctionOptions kNanNullOptions = NanNullOptions::Defaults(); - struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -258,6 +256,7 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace void RegisterScalarValidity(FunctionRegistry* registry) { + static auto kNanNullOptions = NanNullOptions::Defaults(); MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); From f2b439ec7d9f5c65557be4a5c22d7aaa261d129c Mon Sep 17 00:00:00 2001 From: christian Date: Thu, 12 Aug 2021 00:25:02 -0500 Subject: [PATCH 54/66] Remove cython bindigs --- python/pyarrow/_compute.pyx | 8 -------- python/pyarrow/compute.py | 1 - python/pyarrow/includes/libarrow.pxd | 5 ----- python/pyarrow/tests/test_compute.py | 1 - 4 files changed, 15 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 516e0cddf44..aa6e5cfde94 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -796,19 +796,11 @@ cdef class _SliceOptions(FunctionOptions): def _set_options(self, int64_t start, int64_t stop, int64_t step): self.wrapped.reset(new CSliceOptions(start, stop, step)) -cdef class _NanNullOptions(FunctionOptions): - def _set_options(self, bool nan_is_null): - self.wrapped.reset(new CNanNullOptions(nan_is_null)) - class SliceOptions(_SliceOptions): def __init__(self, int64_t start, int64_t stop=sys.maxsize, int64_t step=1): self._set_options(start, stop, step) -class NanNullOptions(_NanNullOptions): - def __init__(self, bool nan_is_null): - self._set_options(nan_is_null) - cdef class _FilterOptions(FunctionOptions): def _set_options(self, null_selection_behavior): if null_selection_behavior == 'drop': diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 63ff6d92a3b..10880c2974c 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -49,7 +49,6 @@ ScalarAggregateOptions, SetLookupOptions, SliceOptions, - NanNullOptions, SortOptions, SplitOptions, SplitPatternOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 1566c289a13..9260cd28f85 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1863,11 +1863,6 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: int64_t start int64_t stop int64_t step - - cdef cppclass CNanNullOptions \ - "arrow::compute::NanNullOptions"(CFunctionOptions): - CNanNullOptions(bool nan_is_null) - bool nan_is_null cdef cppclass CSplitOptions \ "arrow::compute::SplitOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index edaa1a081e2..26d1f0e647c 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -126,7 +126,6 @@ def test_option_class_equality(): pc.ReplaceSubstringOptions("a", "b"), pc.SetLookupOptions(value_set=pa.array([1])), pc.SliceOptions(start=0, stop=1, step=1), - pc.NanNullOptions(nan_is_null=False), pc.SplitPatternOptions(pattern="pattern"), pc.StrptimeOptions("%Y", "s"), pc.TrimOptions(" "), From a3707a766b4cd645cc956352787ecd680344486b Mon Sep 17 00:00:00 2001 From: christian Date: Thu, 12 Aug 2021 17:47:03 -0500 Subject: [PATCH 55/66] binding in cython layer, fix python tests (theorically) --- python/pyarrow/_compute.pyx | 14 ++++++++++++++ python/pyarrow/_dataset.pyx | 8 ++++++-- python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 5 +++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index aa6e5cfde94..aeadaaac6a6 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -796,11 +796,13 @@ cdef class _SliceOptions(FunctionOptions): def _set_options(self, int64_t start, int64_t stop, int64_t step): self.wrapped.reset(new CSliceOptions(start, stop, step)) + class SliceOptions(_SliceOptions): def __init__(self, int64_t start, int64_t stop=sys.maxsize, int64_t step=1): self._set_options(start, stop, step) + cdef class _FilterOptions(FunctionOptions): def _set_options(self, null_selection_behavior): if null_selection_behavior == 'drop': @@ -991,6 +993,18 @@ class DayOfWeekOptions(_DayOfWeekOptions): self._set_options(one_based_numbering, week_start) +cdef class _NanNullOptions(FunctionOptions): + def _set_options(self, nan_is_null): + self.wrapped.reset( + new CNanNullOptions(nan_is_null) + ) + + +class NanNullOptions(_NanNullOptions): + def __init__(self, nan_is_null=False): + self._set_options(nan_is_null) + + cdef class _VarianceOptions(FunctionOptions): def _set_options(self, ddof): self.wrapped.reset(new CVarianceOptions(ddof)) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 945475bd7f1..f2aecc7d60d 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -234,9 +234,13 @@ cdef class Expression(_Weakrefable): """Checks whether the expression is not-null (valid)""" return Expression._call("is_valid", [self]) - def is_null(self): + def is_null(self, bint nan_is_null=False): """Checks whether the expression is null""" - return Expression._call("is_null", [self]) + cdef: + shared_ptr[CFunctionOptions] c_options + + c_options.reset(new CNanNullOptions(nan_is_null)) + return Expression._call("is_null", [self], c_options) def cast(self, type, bint safe=True): """Explicitly change the expression's data type""" diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 10880c2974c..4efa9fbfc41 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -54,6 +54,7 @@ SplitPatternOptions, StrptimeOptions, DayOfWeekOptions, + NanNullOptions, TakeOptions, TDigestOptions, TrimOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9260cd28f85..66cf7e9a460 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1957,6 +1957,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: c_bool one_based_numbering uint32_t week_start + cdef cppclass CNanNullOptions \ + "arrow::compute::NanNullOptions"(CFunctionOptions): + CNanNullOptions(c_bool nan_is_null) + c_bool nan_is_null + cdef cppclass CVarianceOptions \ "arrow::compute::VarianceOptions"(CFunctionOptions): CVarianceOptions(int ddof) From 3028a36d1579d73459bf3762420b9743acc11e66 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 17 Aug 2021 23:36:11 -0400 Subject: [PATCH 56/66] remove enable_if and add NanNullState alias --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 9ff95f5f67f..a3914394682 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -32,9 +32,6 @@ namespace compute { namespace internal { namespace { -template -using enable_if_floating_point = enable_if_t::value, R>; - struct IsValidOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { checked_cast(out)->value = in.is_valid; @@ -78,9 +75,11 @@ struct IsInfOperator { } }; +using NanNullState = OptionsWrapper; + struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - const auto options = OptionsWrapper::Get(ctx); + const auto options = NanNullState::Get(ctx); bool* out_value = &checked_cast(out)->value; if (in.is_valid) { if (is_floating(in.type->id())) { @@ -98,7 +97,6 @@ struct IsNullOperator { } } else { *out_value = false; - } } else { *out_value = true; @@ -108,8 +106,7 @@ struct IsNullOperator { } template - static enable_if_floating_point SetNanBits(const ArrayData& arr, - ArrayData* out) { + static void SetNanBits(const ArrayData& arr, ArrayData* out) { const T* data = arr.GetValues(1); for (int64_t i = 0; i < arr.length; ++i) { if (std::isnan(data[i])) { @@ -130,7 +127,7 @@ struct IsNullOperator { } if (is_floating(arr.type->id())) { - const auto options = OptionsWrapper::Get(ctx); + const auto options = NanNullState::Get(ctx); if (options.nan_is_null) { switch (arr.type->id()) { case Type::FLOAT: @@ -279,8 +276,7 @@ void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec, registry, MemAllocation::PREALLOCATE, - /*can_write_into_slices=*/true, &kNanNullOptions, - OptionsWrapper::Init); + /*can_write_into_slices=*/true, &kNanNullOptions, NanNullState::Init); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc))); DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc))); From 3807e9825048f8cbe7461235f003721d36978105 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 17 Aug 2021 23:36:57 -0400 Subject: [PATCH 57/66] update documentation --- docs/source/cpp/compute.rst | 57 +++++++++++++++++++------------------ python/pyarrow/_dataset.pyx | 2 +- python/pyarrow/array.pxi | 4 +-- python/pyarrow/table.pxi | 4 +-- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 39bbbec3e16..6faec6467ec 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -899,33 +899,33 @@ Structural transforms .. XXX (this category is a bit of a hodgepodge) -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| Function name | Arity | Input types | Output type | Notes | -+==========================+============+===================================================+=====================+=========+ -| case_when | Varargs | Struct of Boolean (Arg 0), Any fixed-width (rest) | Input type | \(1) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| choose | Varargs | Integral (Arg 0); Fixed-width/Binary-like (rest) | Input type | \(2) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| coalesce | Varargs | Any | Input type | \(3) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| fill_null | Binary | Boolean, Null, Numeric, Temporal, String-like | Input type | \(4) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| if_else | Ternary | Boolean, Null, Numeric, Temporal | Input type | \(5) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_finite | Unary | Float, Double | Boolean | \(6) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_inf | Unary | Float, Double | Boolean | \(7) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_nan | Unary | Float, Double | Boolean | \(8) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_null | Unary | Any | Boolean | \(9) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_valid | Unary | Any | Boolean | \(10) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| list_value_length | Unary | List-like | Int32 or Int64 | \(11) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| make_struct | Varargs | Any | Struct | \(12) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++===================+=========+===================================================+================+=============================+=======+ +| case_when | Varargs | Struct of Boolean (Arg 0), Any fixed-width (rest) | Input type | | \(1) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| choose | Varargs | Integral (Arg 0); Fixed-width/Binary-like (rest) | Input type | | \(2) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| coalesce | Varargs | Any | Input type | | \(3) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| fill_null | Binary | Boolean, Null, Numeric, Temporal, String-like | Input type | | \(4) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| if_else | Ternary | Boolean, Null, Numeric, Temporal | Input type | | \(5) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| is_finite | Unary | Float, Double | Boolean | | \(6) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| is_inf | Unary | Float, Double | Boolean | | \(7) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| is_nan | Unary | Float, Double | Boolean | | \(8) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| is_null | Unary | Any | Boolean | :struct:`NanNullOptions` | \(9) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| is_valid | Unary | Any | Boolean | | \(10) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| list_value_length | Unary | List-like | Int32 or Int64 | | \(11) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ +| make_struct | Varargs | Any | Struct | :struct:`MakeStructOptions` | \(12) | ++-------------------+---------+---------------------------------------------------+----------------+-----------------------------+-------+ * \(1) This function acts like a SQL 'case when' statement or switch-case. The input is a "condition" value, which is a struct of Booleans, followed by the @@ -966,7 +966,8 @@ Structural transforms * \(8) Output is true iff the corresponding input element is NaN. -* \(9) Output is true iff the corresponding input element is null. +* \(9) Output is true if the corresponding input element is null or if NaN + values are treated as null via the :struct:`NanNullOptions`. * \(10) Output is true iff the corresponding input element is non-null. diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index f2aecc7d60d..4ad893f4764 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -238,7 +238,7 @@ cdef class Expression(_Weakrefable): """Checks whether the expression is null""" cdef: shared_ptr[CFunctionOptions] c_options - + c_options.reset(new CNanNullOptions(nan_is_null)) return Expression._call("is_null", [self], c_options) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 62523696c8b..c31aa75863a 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1039,11 +1039,11 @@ cdef class Array(_PandasConvertible): else: return 0 - def is_null(self): + def is_null(self, nan_is_null=False): """ Return BooleanArray indicating the null values. """ - return _pc().is_null(self) + return _pc().is_null(self, nan_is_null) def is_valid(self): """ diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index d92bdb2efa3..7d16c851ef6 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -170,11 +170,11 @@ cdef class ChunkedArray(_PandasConvertible): else: index -= self.chunked_array.chunk(j).get().length() - def is_null(self): + def is_null(self, nan_is_null=False): """ Return BooleanArray indicating the null values. """ - return _pc().is_null(self) + return _pc().is_null(self, nan_is_null) def is_valid(self): """ From 8d26437ae89b7b83146348f1c85063ff7f09725f Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 18 Aug 2021 11:21:41 -0400 Subject: [PATCH 58/66] update function doc --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index a3914394682..ed424810922 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -257,8 +257,8 @@ const FunctionDoc is_inf_doc( {"values"}); const FunctionDoc is_null_doc( - "Return true if null, NaN can be considered as null", - ("For each input value, emit true iff the value is null. Default behavior is to emit " + "Return true if null, NaN values can be considered as null", + ("For each input value, emit true if the value is null. Default behavior is to emit " "false for NaN values. True can be emitted for NaN values by toggling " "NanNullOptions flag."), {"values"}, "NanNullOptions"); From 4ad9337e0385a9492156cc62cef51d6f47fc44f8 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 18 Aug 2021 11:22:32 -0400 Subject: [PATCH 59/66] fix NanNullOptions in Python bindings --- python/pyarrow/array.pxi | 3 ++- python/pyarrow/table.pxi | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index c31aa75863a..832ec8eac1a 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1043,7 +1043,8 @@ cdef class Array(_PandasConvertible): """ Return BooleanArray indicating the null values. """ - return _pc().is_null(self, nan_is_null) + options = _pc().NanNullOptions(nan_is_null) + return _pc().call_function('is_null', [self], options) def is_valid(self): """ diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 7d16c851ef6..b6366c7a5d8 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -174,7 +174,8 @@ cdef class ChunkedArray(_PandasConvertible): """ Return BooleanArray indicating the null values. """ - return _pc().is_null(self, nan_is_null) + options = _pc().NanNullOptions(nan_is_null) + return _pc().call_function('is_null', [self], options) def is_valid(self): """ From 915c86894a1669b7bcd905986c249752015d77b4 Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 18 Aug 2021 17:18:32 -0500 Subject: [PATCH 60/66] Requested changes for scalar_validity_test and docs --- .../compute/kernels/scalar_validity_test.cc | 47 ++++++++++--------- docs/source/cpp/compute.rst | 4 +- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index c23ce987b79..2ef04f5edd0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -78,19 +78,23 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { } TEST_F(TestFloatValidityKernels, FloatArrayIsNull) { + // Default NanNullOptions (nan_is_null = false) + NanNullOptions options; // All NaN CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false]")); + ArrayFromJSON(boolean(), "[false, false, false, false, false]"), + &options); // No NaN CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &options); // Some NaNs CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); -} + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &options); -TEST_F(TestFloatValidityKernels, FloatArrayIsNullUsingNanNullOptions) { - const NanNullOptions& options = NanNullOptions(true); + // Set nan_is_null value to true + options.nan_is_null = true; // All NaN CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options); @@ -108,16 +112,18 @@ TEST_F(TestDoubleValidityKernels, DoubleArrayIsNull) { // All NaN CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), ArrayFromJSON(boolean(), "[false, false, false, false, false]")); + + NanNullOptions options{false}; // No NaN CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &options); // Some NaNs CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, true]")); -} + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &options); -TEST_F(TestDoubleValidityKernels, DoubleArrayIsNullUsingNanNullOptions) { - const NanNullOptions& options = NanNullOptions(true); + options.nan_is_null = true; // All NaN CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options); @@ -134,13 +140,13 @@ TEST_F(TestDoubleValidityKernels, DoubleArrayIsNullUsingNanNullOptions) { TEST_F(TestFloatValidityKernels, FloatScalarIsNull) { CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false)); CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(false)); + + NanNullOptions options{false}; CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false)); - CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true)); -} + MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true), &options); -TEST_F(TestFloatValidityKernels, FloatScalarIsNullUsingNanNullOptions) { - const NanNullOptions& options = NanNullOptions(true); + options.nan_is_null = true; CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false), &options); CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(true), &options); CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), @@ -149,15 +155,14 @@ TEST_F(TestFloatValidityKernels, FloatScalarIsNullUsingNanNullOptions) { } TEST_F(TestDoubleValidityKernels, DoubleScalarIsNull) { - CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false)); - CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(false)); + NanNullOptions options{false}; + CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false), &options); + CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(false), &options); CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), MakeScalar(false)); CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); -} -TEST_F(TestDoubleValidityKernels, DoubleScalarIsNullUsingNanNullOptions) { - const NanNullOptions& options = NanNullOptions(true); + options.nan_is_null = true; CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false), &options); CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(true), &options); CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 6faec6467ec..ec1218c0d34 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -966,8 +966,8 @@ Structural transforms * \(8) Output is true iff the corresponding input element is NaN. -* \(9) Output is true if the corresponding input element is null or if NaN - values are treated as null via the :struct:`NanNullOptions`. +* \(9) Output is true if the corresponding input element is null. NaN values + can be considered as null via the :struct:`NanNullOptions`. * \(10) Output is true iff the corresponding input element is non-null. From 4588822f18bcffa1819c82a60065f87bb6f47ebd Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 18 Aug 2021 18:05:59 -0500 Subject: [PATCH 61/66] Add python test using nan_is_null --- python/pyarrow/tests/test_compute.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 26d1f0e647c..d7eb4ddf302 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1225,7 +1225,6 @@ def test_arithmetic_multiply(): def test_is_null(): arr = pa.array([1, 2, 3, None]) result = arr.is_null() - result = arr.is_null() expected = pa.array([False, False, False, True]) assert result.equals(expected) assert result.equals(pc.is_null(arr)) @@ -1242,6 +1241,15 @@ def test_is_null(): expected = pa.chunked_array([[True, True], [True, False]]) assert result.equals(expected) + arr = pa.array([1, 2, 3, None, np.nan]) + result = arr.is_null() + expected = pa.array([False, False, False, True, False]) + assert result.equals(expected) + + result = arr.is_null(nan_is_null=True) + expected = pa.array([False, False, False, True, True]) + assert result.equals(expected) + def test_fill_null(): arr = pa.array([1, 2, None, 4], type=pa.int8()) From 9a4f39a27e5fff5247e3d42aee6b79fbb6430cf0 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 20 Aug 2021 18:46:25 -0400 Subject: [PATCH 62/66] Update R bindings --- r/R/arrow-datum.R | 8 +------- r/R/dplyr-functions.R | 9 +-------- r/R/expression.R | 8 +------- r/src/compute.cpp | 9 +++++++++ 4 files changed, 12 insertions(+), 22 deletions(-) diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index b3635f239c4..b1ff18fe434 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -49,13 +49,7 @@ is.infinite.ArrowDatum <- function(x) { #' @export is.na.ArrowDatum <- function(x) { - # TODO: if an option is added to the is_null kernel to treat NaN as NA, - # use that to simplify the code here (ARROW-13367) - if (x$type_id() %in% TYPES_WITH_NAN) { - call_function("is_nan", x) | call_function("is_null", x) - } else { - call_function("is_null", x) - } + call_function("is_null", x, options = list(nan_is_null = TRUE)) } #' @export diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 9972d4796a8..8bbbdce98d1 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -96,14 +96,7 @@ nse_funcs$coalesce <- function(...) { } nse_funcs$is.na <- function(x) { - # TODO: if an option is added to the is_null kernel to treat NaN as NA, - # use that to simplify the code here (ARROW-13367) - if (is.double(x) || (inherits(x, "Expression") && - x$type_id() %in% TYPES_WITH_NAN)) { - build_expr("is_nan", x) | build_expr("is_null", x) - } else { - build_expr("is_null", x) - } + build_expr("is_null", x, options = list(nan_is_null = TRUE)) } nse_funcs$is.nan <- function(x) { diff --git a/r/R/expression.R b/r/R/expression.R index 0526eb73bc9..57466cc3c71 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -231,11 +231,5 @@ Ops.Expression <- function(e1, e2) { #' @export is.na.Expression <- function(x) { - if (!is.null(x$schema) && x$type_id() %in% TYPES_WITH_NAN) { - # TODO: if an option is added to the is_null kernel to treat NaN as NA, - # use that to simplify the code here (ARROW-13367) - Expression$create("is_nan", x) | build_expr("is_null", x) - } else { - Expression$create("is_null", x) - } + Expression$create("is_null", x, options = list(nan_is_null = TRUE)) } diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 0695e2525f7..2910f270262 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -220,6 +220,15 @@ std::shared_ptr make_compute_options( cpp11::as_cpp(options["skip_nulls"])); } + if (func_name == "is_null") { + using Options = arrow::compute::NanNullOptions; + auto out = std::make_shared(Options::Defaults()); + if (!Rf_isNull(options["nan_is_null"])) { + out->nan_is_null = cpp11::as_cpp(options["nan_is_null"]); + } + return out; + } + if (func_name == "dictionary_encode") { using Options = arrow::compute::DictionaryEncodeOptions; auto out = std::make_shared(Options::Defaults()); From 71b85d2341278a3c5798a4e14412f5987a50b856 Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 25 Aug 2021 10:02:45 -0500 Subject: [PATCH 63/66] Add cpp/submodules --- cpp/submodules/parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 600d437de0e..ddd89895880 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 600d437de0e8b0e9927c87e76f844a1b385b02e8 +Subproject commit ddd898958803cb89b7156c6350584d1cda0fe8de From 7804c492635b8ab33b9a77b5cf04b6d89e6f1850 Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 25 Aug 2021 10:43:28 -0500 Subject: [PATCH 64/66] parquet_testing sub without changes --- cpp/submodules/parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index ddd89895880..600d437de0e 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit ddd898958803cb89b7156c6350584d1cda0fe8de +Subproject commit 600d437de0e8b0e9927c87e76f844a1b385b02e8 From 30751fcb949647eb88f52dc1aa0cc61c53dc90d6 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 26 Aug 2021 15:11:35 +0200 Subject: [PATCH 65/66] Address review comments, also restructure docs a bit --- cpp/src/arrow/compute/api_scalar.cc | 14 +- cpp/src/arrow/compute/api_scalar.h | 13 +- cpp/src/arrow/compute/exec/expression.cc | 3 +- .../arrow/compute/kernels/scalar_validity.cc | 70 ++-- .../compute/kernels/scalar_validity_test.cc | 320 ++++++------------ docs/source/cpp/compute.rst | 114 ++++--- python/pyarrow/_compute.pyx | 6 +- python/pyarrow/_dataset.pyx | 2 +- python/pyarrow/array.pxi | 13 +- python/pyarrow/compute.py | 6 +- python/pyarrow/includes/libarrow.pxd | 6 +- python/pyarrow/table.pxi | 17 +- r/src/compute.cpp | 2 +- 13 files changed, 248 insertions(+), 338 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index fc27905d7aa..599ce75bece 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -161,8 +161,8 @@ static auto kMakeStructOptionsType = GetFunctionOptionsType( static auto kDayOfWeekOptionsType = GetFunctionOptionsType( DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering), DataMember("week_start", &DayOfWeekOptions::week_start)); -static auto kNanNullOptionsType = GetFunctionOptionsType( - DataMember("nan_is_null", &NanNullOptions::nan_is_null)); +static auto kNullOptionsType = GetFunctionOptionsType( + DataMember("nan_is_null", &NullOptions::nan_is_null)); } // namespace } // namespace internal @@ -293,9 +293,9 @@ DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start week_start(week_start) {} constexpr char DayOfWeekOptions::kTypeName[]; -NanNullOptions::NanNullOptions(bool nan_is_null) - : FunctionOptions(internal::kNanNullOptionsType), nan_is_null(nan_is_null) {} -constexpr char NanNullOptions::kTypeName[]; +NullOptions::NullOptions(bool nan_is_null) + : FunctionOptions(internal::kNullOptionsType), nan_is_null(nan_is_null) {} +constexpr char NullOptions::kTypeName[]; namespace internal { void RegisterScalarOptions(FunctionRegistry* registry) { @@ -316,7 +316,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType)); - DCHECK_OK(registry->AddFunctionOptionsType(kNanNullOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kNullOptionsType)); } } // namespace internal @@ -489,7 +489,7 @@ Result CaseWhen(const Datum& cond, const std::vector& cases, return CallFunction("case_when", args, ctx); } -Result IsNull(const Datum& arg, NanNullOptions options, ExecContext* ctx) { +Result IsNull(const Datum& arg, NullOptions options, ExecContext* ctx) { return CallFunction("is_null", {arg}, &options, ctx); } diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 92ff517152b..769ab0e7874 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -224,11 +224,11 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { int64_t start, stop, step; }; -class ARROW_EXPORT NanNullOptions : public FunctionOptions { +class ARROW_EXPORT NullOptions : public FunctionOptions { public: - explicit NanNullOptions(bool nan_is_null = false); - constexpr static char const kTypeName[] = "NanNullOptions"; - static NanNullOptions Defaults() { return NanNullOptions{}; } + explicit NullOptions(bool nan_is_null = false); + constexpr static char const kTypeName[] = "NullOptions"; + static NullOptions Defaults() { return NullOptions{}; } bool nan_is_null = false; }; @@ -765,15 +765,14 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// false otherwise /// /// \param[in] values input to examine for nullity -/// \param[in] options NanNullOptions +/// \param[in] options NullOptions /// \param[in] ctx the function execution context, optional /// \return the resulting datum /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, - NanNullOptions options = NanNullOptions::Defaults(), +Result IsNull(const Datum& values, NullOptions options = NullOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc index 34d21d6c87f..64e3305825d 100644 --- a/cpp/src/arrow/compute/exec/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -1155,8 +1155,7 @@ Expression greater_equal(Expression lhs, Expression rhs) { } Expression is_null(Expression lhs, bool nan_is_null) { - return call("is_null", {std::move(lhs)}, - compute::NanNullOptions(std::move(nan_is_null))); + return call("is_null", {std::move(lhs)}, compute::NullOptions(std::move(nan_is_null))); } Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); } diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index ed424810922..d23a909c6fd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -75,25 +75,25 @@ struct IsInfOperator { } }; -using NanNullState = OptionsWrapper; +using NanOptionsState = OptionsWrapper; struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - const auto options = NanNullState::Get(ctx); + const auto& options = NanOptionsState::Get(ctx); bool* out_value = &checked_cast(out)->value; + if (in.is_valid) { - if (is_floating(in.type->id())) { + if (options.nan_is_null && is_floating(in.type->id())) { switch (in.type->id()) { case Type::FLOAT: - *out_value = options.nan_is_null && - std::isnan(internal::UnboxScalar::Unbox(in)); + *out_value = std::isnan(internal::UnboxScalar::Unbox(in)); break; case Type::DOUBLE: - *out_value = options.nan_is_null && - std::isnan(internal::UnboxScalar::Unbox(in)); + *out_value = std::isnan(internal::UnboxScalar::Unbox(in)); break; default: - return Status::NotImplemented("Type not implemented"); + return Status::NotImplemented("NaN detection not implemented for type ", + in.type->ToString()); } } else { *out_value = false; @@ -106,42 +106,41 @@ struct IsNullOperator { } template - static void SetNanBits(const ArrayData& arr, ArrayData* out) { + static void SetNanBits(const ArrayData& arr, uint8_t* out_bitmap, int64_t out_offset) { const T* data = arr.GetValues(1); for (int64_t i = 0; i < arr.length; ++i) { if (std::isnan(data[i])) { - BitUtil::SetBit(out->buffers[1]->mutable_data(), i); + BitUtil::SetBit(out_bitmap, i + out_offset); } } } static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { - if (arr.MayHaveNulls()) { + const auto& options = NanOptionsState::Get(ctx); + + uint8_t* out_bitmap = out->buffers[1]->mutable_data(); + if (arr.GetNullCount() > 0) { // Input has nulls => output is the inverted null (validity) bitmap. - InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, - out->buffers[1]->mutable_data(), out->offset); + InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, out_bitmap, + out->offset); } else { // Input has no nulls => output is entirely false. - BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, - false); + BitUtil::SetBitsTo(out_bitmap, out->offset, out->length, false); } - if (is_floating(arr.type->id())) { - const auto options = NanNullState::Get(ctx); - if (options.nan_is_null) { - switch (arr.type->id()) { - case Type::FLOAT: - SetNanBits(arr, out); - break; - case Type::DOUBLE: - SetNanBits(arr, out); - break; - default: - return Status::NotImplemented("Type not implemented"); - } + if (is_floating(arr.type->id()) && options.nan_is_null) { + switch (arr.type->id()) { + case Type::FLOAT: + SetNanBits(arr, out_bitmap, out->offset); + break; + case Type::DOUBLE: + SetNanBits(arr, out_bitmap, out->offset); + break; + default: + return Status::NotImplemented("NaN detection not implemented for type ", + arr.type->ToString()); } } - return Status::OK(); } }; @@ -257,11 +256,10 @@ const FunctionDoc is_inf_doc( {"values"}); const FunctionDoc is_null_doc( - "Return true if null, NaN values can be considered as null", - ("For each input value, emit true if the value is null. Default behavior is to emit " - "false for NaN values. True can be emitted for NaN values by toggling " - "NanNullOptions flag."), - {"values"}, "NanNullOptions"); + "Return true if null (and optionally NaN)", + ("For each input value, emit true iff the value is null.\n" + "True may also be emitted for NaN values by setting the `nan_is_null` flag."), + {"values"}, "NullOptions"); const FunctionDoc is_nan_doc("Return true if NaN", ("For each input value, emit true iff the value is NaN."), @@ -270,13 +268,13 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace void RegisterScalarValidity(FunctionRegistry* registry) { - static auto kNanNullOptions = NanNullOptions::Defaults(); + static auto kNullOptions = NullOptions::Defaults(); MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec, registry, MemAllocation::PREALLOCATE, - /*can_write_into_slices=*/true, &kNanNullOptions, NanNullState::Init); + /*can_write_into_slices=*/true, &kNullOptions, NanOptionsState::Init); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc))); DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc))); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 2ef04f5edd0..f361dad8a0c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -77,240 +77,120 @@ TEST_F(TestBooleanValidityKernels, ArrayIsNull) { "[true, false, false, true]"); } -TEST_F(TestFloatValidityKernels, FloatArrayIsNull) { - // Default NanNullOptions (nan_is_null = false) - NanNullOptions options; - // All NaN - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false]"), - &options); - // No NaN - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), - &options); - // Some NaNs - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), - &options); - - // Set nan_is_null value to true - options.nan_is_null = true; - // All NaN - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), - ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options); - // No NaN - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), - &options); - // Some NaNs - CheckScalarUnary("is_null", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), - ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"), - &options); -} - -TEST_F(TestDoubleValidityKernels, DoubleArrayIsNull) { - // All NaN - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false]")); - - NanNullOptions options{false}; - // No NaN - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), - &options); - // Some NaNs - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), - &options); - - options.nan_is_null = true; - // All NaN - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), - ArrayFromJSON(boolean(), "[true, true, true, true, true]"), &options); - // No NaN - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), - &options); - // Some NaNs - CheckScalarUnary("is_null", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), - ArrayFromJSON(boolean(), "[false, true, false, true, false, true]"), - &options); -} - -TEST_F(TestFloatValidityKernels, FloatScalarIsNull) { - CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false)); - CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(false)); - - NanNullOptions options{false}; - CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false), &options); - CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true), &options); - - options.nan_is_null = true; - CheckScalarUnary("is_null", MakeScalar(42.0f), MakeScalar(false), &options); - CheckScalarUnary("is_null", MakeScalar(std::nanf("")), MakeScalar(true), &options); - CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false), &options); - CheckScalarUnary("is_null", MakeNullScalar(float32()), MakeScalar(true), &options); -} - -TEST_F(TestDoubleValidityKernels, DoubleScalarIsNull) { - NanNullOptions options{false}; - CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false), &options); - CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(false), &options); - CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false)); - CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); - - options.nan_is_null = true; - CheckScalarUnary("is_null", MakeScalar(42.0), MakeScalar(false), &options); - CheckScalarUnary("is_null", MakeScalar(std::nan("")), MakeScalar(true), &options); - CheckScalarUnary("is_null", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false), &options); - CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true), &options); -} - TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { auto arr = ArrayFromJSON(int32(), "[1, 2, 3, 4]"); std::shared_ptr result = (*IsNull(arr)).array(); ASSERT_EQ(result->null_count, 0); } -TEST_F(TestFloatValidityKernels, FloatArrayIsFinite) { - // All Inf - CheckScalarUnary("is_finite", ArrayFromJSON(float32(), "[Inf, -Inf, Inf, -Inf, Inf]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false]")); - // No Inf - CheckScalarUnary("is_finite", - ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, NaN, null]"), - ArrayFromJSON(boolean(), "[true, true, true, true, false, null]")); - // Some Inf - CheckScalarUnary("is_finite", - ArrayFromJSON(float32(), "[0.0, Inf, 2.0, -Inf, NaN, null]"), - ArrayFromJSON(boolean(), "[true, false, true, false, false, null]")); -} - -TEST_F(TestDoubleValidityKernels, DoubleArrayIsFinite) { - // All Inf - CheckScalarUnary("is_finite", ArrayFromJSON(float64(), "[Inf, -Inf, Inf, -Inf, Inf]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false]")); - // No Inf - CheckScalarUnary("is_finite", - ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, NaN, null]"), - ArrayFromJSON(boolean(), "[true, true, true, true, false, null]")); - // Some Inf - CheckScalarUnary("is_finite", - ArrayFromJSON(float64(), "[0.0, Inf, 2.0, -Inf, NaN, null]"), - ArrayFromJSON(boolean(), "[true, false, true, false, false, null]")); -} - -TEST_F(TestFloatValidityKernels, FloatScalarIsFinite) { - CheckScalarUnary("is_finite", MakeNullScalar(float32()), MakeNullScalar(boolean())); - CheckScalarUnary("is_finite", MakeScalar(42.0f), MakeScalar(true)); - CheckScalarUnary("is_finite", MakeScalar(std::nanf("")), MakeScalar(false)); - CheckScalarUnary("is_finite", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false)); - CheckScalarUnary("is_finite", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(false)); -} - -TEST_F(TestDoubleValidityKernels, DoubleScalarIsFinite) { - CheckScalarUnary("is_finite", MakeNullScalar(float64()), MakeNullScalar(boolean())); - CheckScalarUnary("is_finite", MakeScalar(42.0), MakeScalar(true)); - CheckScalarUnary("is_finite", MakeScalar(std::nan("")), MakeScalar(false)); - CheckScalarUnary("is_finite", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false)); - CheckScalarUnary("is_finite", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(false)); -} +template +class TestFloatingPointValidityKernels : public TestValidityKernels { + public: + void TestIsNull() { + NullOptions default_options; + NullOptions nan_is_null_options(/*nan_is_null=*/true); + + auto ty = this->type_singleton(); + auto arr = ArrayFromJSON(ty, "[]"); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[]")); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[]"), &default_options); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[]"), + &nan_is_null_options); + + // Without nulls + arr = ArrayFromJSON(ty, "[1.5, 0.0, -0.0, Inf, -Inf, NaN]"); + CheckScalarUnary( + "is_null", arr, + ArrayFromJSON(boolean(), "[false, false, false, false, false, false]")); + CheckScalarUnary( + "is_null", arr, + ArrayFromJSON(boolean(), "[false, false, false, false, false, false]"), + &default_options); + CheckScalarUnary( + "is_null", arr, + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &nan_is_null_options); + + // With nulls + arr = ArrayFromJSON(ty, "[1.5, -0.0, null, Inf, -Inf, NaN]"); + CheckScalarUnary( + "is_null", arr, + ArrayFromJSON(boolean(), "[false, false, true, false, false, false]")); + CheckScalarUnary( + "is_null", arr, + ArrayFromJSON(boolean(), "[false, false, true, false, false, false]"), + &default_options); + CheckScalarUnary("is_null", arr, + ArrayFromJSON(boolean(), "[false, false, true, false, false, true]"), + &nan_is_null_options); + + // Only nulls + arr = ArrayFromJSON(ty, "[null, null, null]"); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[true, true, true]")); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[true, true, true]"), + &default_options); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[true, true, true]"), + &nan_is_null_options); + } -TEST_F(TestFloatValidityKernels, FloatArrayIsInf) { - // All Inf - CheckScalarUnary("is_inf", ArrayFromJSON(float32(), "[Inf, -Inf, Inf, -Inf, Inf]"), - ArrayFromJSON(boolean(), "[true, true, true, true, true]")); - // No Inf - CheckScalarUnary("is_inf", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, NaN, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); - // Some Infs - CheckScalarUnary("is_inf", ArrayFromJSON(float32(), "[0.0, Inf, 2.0, -Inf, NaN, null]"), - ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); -} + void TestIsFinite() { + auto ty = this->type_singleton(); + CheckScalarUnary("is_finite", ArrayFromJSON(ty, "[]"), + ArrayFromJSON(boolean(), "[]")); + + // All Inf + CheckScalarUnary("is_finite", ArrayFromJSON(ty, "[Inf, -Inf, Inf, -Inf, Inf]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false]")); + // No Inf + CheckScalarUnary("is_finite", ArrayFromJSON(ty, "[0.0, 1.0, 2.0, 3.0, NaN, null]"), + ArrayFromJSON(boolean(), "[true, true, true, true, false, null]")); + // Some Inf + CheckScalarUnary("is_finite", ArrayFromJSON(ty, "[0.0, Inf, 2.0, -Inf, NaN, null]"), + ArrayFromJSON(boolean(), "[true, false, true, false, false, null]")); + } -TEST_F(TestDoubleValidityKernels, DoubleArrayIsInf) { - // All Inf - CheckScalarUnary("is_inf", ArrayFromJSON(float64(), "[Inf, -Inf, Inf, -Inf, Inf]"), - ArrayFromJSON(boolean(), "[true, true, true, true, true]")); - // No Inf - CheckScalarUnary("is_inf", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, NaN, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); - // Some Infs - CheckScalarUnary("is_inf", ArrayFromJSON(float64(), "[0.0, Inf, 2.0, -Inf, NaN, null]"), - ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); -} + void TestIsInf() { + auto ty = this->type_singleton(); + CheckScalarUnary("is_inf", ArrayFromJSON(ty, "[]"), ArrayFromJSON(boolean(), "[]")); + + // All Inf + CheckScalarUnary("is_inf", ArrayFromJSON(ty, "[Inf, -Inf, Inf, -Inf, Inf]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]")); + // No Inf + CheckScalarUnary( + "is_inf", ArrayFromJSON(ty, "[0.0, 1.0, 2.0, 3.0, NaN, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); + // Some Inf + CheckScalarUnary("is_inf", ArrayFromJSON(ty, "[0.0, Inf, 2.0, -Inf, NaN, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); + } -TEST_F(TestFloatValidityKernels, FloatScalarIsInf) { - CheckScalarUnary("is_inf", MakeNullScalar(float32()), MakeNullScalar(boolean())); - CheckScalarUnary("is_inf", MakeScalar(42.0f), MakeScalar(false)); - CheckScalarUnary("is_inf", MakeScalar(std::nanf("")), MakeScalar(false)); - CheckScalarUnary("is_inf", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(true)); - CheckScalarUnary("is_inf", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(true)); -} + void TestIsNan() { + auto ty = this->type_singleton(); + CheckScalarUnary("is_nan", ArrayFromJSON(ty, "[]"), ArrayFromJSON(boolean(), "[]")); + + // All NaN + CheckScalarUnary("is_nan", ArrayFromJSON(ty, "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]")); + // No NaN + CheckScalarUnary( + "is_nan", ArrayFromJSON(ty, "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); + // Some NaNs + CheckScalarUnary("is_nan", ArrayFromJSON(ty, "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); + } +}; -TEST_F(TestDoubleValidityKernels, DoubleScalarIsInf) { - CheckScalarUnary("is_inf", MakeNullScalar(float64()), MakeNullScalar(boolean())); - CheckScalarUnary("is_inf", MakeScalar(42.0), MakeScalar(false)); - CheckScalarUnary("is_inf", MakeScalar(std::nan("")), MakeScalar(false)); - CheckScalarUnary("is_inf", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(true)); - CheckScalarUnary("is_inf", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(true)); -} +TYPED_TEST_SUITE(TestFloatingPointValidityKernels, RealArrowTypes); -TEST_F(TestFloatValidityKernels, FloatArrayIsNan) { - // All NaN - CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), - ArrayFromJSON(boolean(), "[true, true, true, true, true]")); - // No NaN - CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); - // Some NaNs - CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), - ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); -} +TYPED_TEST(TestFloatingPointValidityKernels, IsNull) { this->TestIsNull(); } -TEST_F(TestDoubleValidityKernels, DoubleArrayIsNan) { - // All NaN - CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), - ArrayFromJSON(boolean(), "[true, true, true, true, true]")); - // No NaN - CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); - // Some NaNs - CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), - ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); -} +TYPED_TEST(TestFloatingPointValidityKernels, IsFinite) { this->TestIsFinite(); } -TEST_F(TestFloatValidityKernels, FloatScalarIsNan) { - CheckScalarUnary("is_nan", MakeNullScalar(float32()), MakeNullScalar(boolean())); - CheckScalarUnary("is_nan", MakeScalar(42.0f), MakeScalar(false)); - CheckScalarUnary("is_nan", MakeScalar(std::nanf("")), MakeScalar(true)); - CheckScalarUnary("is_nan", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false)); - CheckScalarUnary("is_nan", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(false)); -} +TYPED_TEST(TestFloatingPointValidityKernels, IsInf) { this->TestIsInf(); } -TEST_F(TestDoubleValidityKernels, DoubleScalarIsNan) { - CheckScalarUnary("is_nan", MakeNullScalar(float64()), MakeNullScalar(boolean())); - CheckScalarUnary("is_nan", MakeScalar(42.0), MakeScalar(false)); - CheckScalarUnary("is_nan", MakeScalar(std::nan("")), MakeScalar(true)); - CheckScalarUnary("is_nan", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false)); - CheckScalarUnary("is_nan", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(false)); -} +TYPED_TEST(TestFloatingPointValidityKernels, IsNan) { this->TestIsNan(); } } // namespace compute } // namespace arrow diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index f2b93958b82..653fb224e50 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -987,49 +987,66 @@ in reverse order. as given by :struct:`SliceOptions` where ``start`` and ``stop`` are measured in codeunits. Null inputs emit null. -.. _cpp-compute-scalar-structural-transforms: +Categorizations +~~~~~~~~~~~~~~~ + ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++===================+============+=====================+=====================+========================+=========+ +| is_finite | Unary | Float, Double | Boolean | | \(1) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_inf | Unary | Float, Double | Boolean | | \(2) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_nan | Unary | Float, Double | Boolean | | \(3) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_null | Unary | Any | Boolean | :struct:`NullOptions` | \(4) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_valid | Unary | Any | Boolean | | \(5) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ + +* \(1) Output is true iff the corresponding input element is finite (neither Infinity, + -Infinity, nor NaN). + +* \(2) Output is true iff the corresponding input element is Infinity/-Infinity. + +* \(3) Output is true iff the corresponding input element is NaN. + +* \(4) Output is true iff the corresponding input element is null. NaN values + can also be considered null by setting :struct:`NullOptions::nan_is_null`. + +* \(5) Output is true iff the corresponding input element is non-null. -Structural transforms -~~~~~~~~~~~~~~~~~~~~~ -.. XXX (this category is a bit of a hodgepodge) - -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| Function name | Arity | Input types | Output type | Notes | -+==========================+============+===================================================+=====================+=========+ -| case_when | Varargs | Struct of Boolean (Arg 0), Any (rest) | Input type | \(1) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| choose | Varargs | Integral (Arg 0); Fixed-width/Binary-like (rest) | Input type | \(2) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| coalesce | Varargs | Any | Input type | \(3) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| fill_null | Binary | Boolean, Null, Numeric, Temporal, String-like | Input type | \(4) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| if_else | Ternary | Boolean, Null, Numeric, Temporal | Input type | \(5) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_finite | Unary | Float, Double | Boolean | \(6) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_inf | Unary | Float, Double | Boolean | \(7) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_nan | Unary | Float, Double | Boolean | \(8) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_null | Unary | Any | Boolean | \(9) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_valid | Unary | Any | Boolean | \(10) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| list_value_length | Unary | List-like | Int32 or Int64 | \(11) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| make_struct | Varargs | Any | Struct | \(12) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ - -* \(1) This function acts like a SQL 'case when' statement or switch-case. The +.. _cpp-compute-scalar-selections: + +Selecting / multiplexing +~~~~~~~~~~~~~~~~~~~~~~~~ + +For each "row" of input values, these functions emit one of the input values, +depending on a condition. + ++------------------+------------+---------------------------------------------------+---------------------+---------+ +| Function name | Arity | Input types | Output type | Notes | ++==================+============+===================================================+=====================+=========+ +| case_when | Varargs | Struct of Boolean (Arg 0), Any (rest) | Input type | \(1) | ++------------------+------------+---------------------------------------------------+---------------------+---------+ +| choose | Varargs | Integral (Arg 0); Fixed-width/Binary-like (rest) | Input type | \(2) | ++------------------+------------+---------------------------------------------------+---------------------+---------+ +| coalesce | Varargs | Any | Input type | \(3) | ++------------------+------------+---------------------------------------------------+---------------------+---------+ +| fill_null | Binary | Boolean, Null, Numeric, Temporal, String-like | Input type | \(4) | ++------------------+------------+---------------------------------------------------+---------------------+---------+ +| if_else | Ternary | Boolean, Null, Numeric, Temporal | Input type | \(5) | ++------------------+------------+---------------------------------------------------+---------------------+---------+ + +* \(1) This function acts like a SQL "case when" statement or switch-case. The input is a "condition" value, which is a struct of Booleans, followed by the values for each "branch". There must be either exactly one value argument for each child of the condition struct, or one more value argument than children - (in which case we have an 'else' or 'default' value). The output is of the + (in which case we have an "else" or "default" value). The output is of the same type as the value inputs; each row will be the corresponding value from the first value datum for which the corresponding Boolean is true, or the - corresponding value from the 'default' input, or null otherwise. + corresponding value from the "default" input, or null otherwise. Note that currently, while all types are supported, dictionaries will be unpacked. @@ -1057,22 +1074,21 @@ Structural transforms Also see: :ref:`replace_with_mask `. -* \(6) Output is true iff the corresponding input element is finite (not Infinity, - -Infinity, or NaN). - -* \(7) Output is true iff the corresponding input element is Infinity/-Infinity. - -* \(8) Output is true iff the corresponding input element is NaN. - -* \(9) Output is true if the corresponding input element is null. NaN values - can be considered as null via the :struct:`NanNullOptions`. +Structural transforms +~~~~~~~~~~~~~~~~~~~~~ -* \(10) Output is true iff the corresponding input element is non-null. ++--------------------------+------------+----------------+-------------------+------------------------------+---------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++==========================+============+================+===================+==============================+=========+ +| list_value_length | Unary | List-like | Int32 or Int64 | | \(1) | ++--------------------------+------------+----------------+-------------------+------------------------------+---------+ +| make_struct | Varargs | Any | Struct | :struct:`MakeStructOptions` | \(2) | ++--------------------------+------------+----------------+-------------------+------------------------------+---------+ -* \(11) Each output element is the length of the corresponding input element +* \(1) Each output element is the length of the corresponding input element (null if input is null). Output type is Int32 for List, Int64 for LargeList. -* \(12) The output struct's field types are the types of its arguments. The +* \(2) The output struct's field types are the types of its arguments. The field names are specified using an instance of :struct:`MakeStructOptions`. The output shape will be scalar if all inputs are scalar, otherwise any scalars will be broadcast to arrays. @@ -1367,4 +1383,4 @@ replaced, based on the remaining inputs. is true is replaced with the next value from input 3. A null in input 2 results in a corresponding null in the output. - Also see: :ref:`if_else `. + Also see: :ref:`if_else `. diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 9c15eaefc5f..99ad14496ca 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1005,14 +1005,14 @@ class DayOfWeekOptions(_DayOfWeekOptions): self._set_options(one_based_numbering, week_start) -cdef class _NanNullOptions(FunctionOptions): +cdef class _NullOptions(FunctionOptions): def _set_options(self, nan_is_null): self.wrapped.reset( - new CNanNullOptions(nan_is_null) + new CNullOptions(nan_is_null) ) -class NanNullOptions(_NanNullOptions): +class NullOptions(_NullOptions): def __init__(self, nan_is_null=False): self._set_options(nan_is_null) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 4ad893f4764..7342d9c57f5 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -239,7 +239,7 @@ cdef class Expression(_Weakrefable): cdef: shared_ptr[CFunctionOptions] c_options - c_options.reset(new CNanNullOptions(nan_is_null)) + c_options.reset(new CNullOptions(nan_is_null)) return Expression._call("is_null", [self], c_options) def cast(self, type, bint safe=True): diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 01f186bad94..558184588d2 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1039,11 +1039,20 @@ cdef class Array(_PandasConvertible): else: return 0 - def is_null(self, nan_is_null=False): + def is_null(self, *, nan_is_null=False): """ Return BooleanArray indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array """ - options = _pc().NanNullOptions(nan_is_null) + options = _pc().NullOptions(nan_is_null) return _pc().call_function('is_null', [self], options) def is_valid(self): diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index e35801b49e6..1a017ea2ef4 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -32,6 +32,7 @@ ArraySortOptions, CastOptions, CountOptions, + DayOfWeekOptions, DictionaryEncodeOptions, ElementWiseAggregateOptions, ExtractRegexOptions, @@ -43,6 +44,7 @@ PadOptions, PartitionNthOptions, MakeStructOptions, + NullOptions, QuantileOptions, ReplaceSliceOptions, ReplaceSubstringOptions, @@ -52,10 +54,8 @@ SortOptions, SplitOptions, SplitPatternOptions, - StrptimeOptions, StrftimeOptions, - DayOfWeekOptions, - NanNullOptions, + StrptimeOptions, TakeOptions, TDigestOptions, TrimOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 67a9d4cb6b0..2a972e3f6c3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1961,9 +1961,9 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: c_bool one_based_numbering uint32_t week_start - cdef cppclass CNanNullOptions \ - "arrow::compute::NanNullOptions"(CFunctionOptions): - CNanNullOptions(c_bool nan_is_null) + cdef cppclass CNullOptions \ + "arrow::compute::NullOptions"(CFunctionOptions): + CNullOptions(c_bool nan_is_null) c_bool nan_is_null cdef cppclass CVarianceOptions \ diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 1dc28307878..247f6d67baa 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -170,16 +170,25 @@ cdef class ChunkedArray(_PandasConvertible): else: index -= self.chunked_array.chunk(j).get().length() - def is_null(self, nan_is_null=False): + def is_null(self, *, nan_is_null=False): """ - Return BooleanArray indicating the null values. + Return boolean array indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array or ChunkedArray """ - options = _pc().NanNullOptions(nan_is_null) + options = _pc().NullOptions(nan_is_null) return _pc().call_function('is_null', [self], options) def is_valid(self): """ - Return BooleanArray indicating the non-null values. + Return boolean array indicating the non-null values. """ return _pc().is_valid(self) diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 36d073ed351..8a12e38cc22 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -223,7 +223,7 @@ std::shared_ptr make_compute_options( } if (func_name == "is_null") { - using Options = arrow::compute::NanNullOptions; + using Options = arrow::compute::NullOptions; auto out = std::make_shared(Options::Defaults()); if (!Rf_isNull(options["nan_is_null"])) { out->nan_is_null = cpp11::as_cpp(options["nan_is_null"]); From f09d52a451a2900b7afe0d2d19a6145f910b889d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 26 Aug 2021 16:22:41 +0200 Subject: [PATCH 66/66] Some more test cleanup --- .../compute/kernels/scalar_validity_test.cc | 54 +++++++++++-------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index f361dad8a0c..35a6b831ef4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -40,8 +40,6 @@ class TestValidityKernels : public ::testing::Test { }; using TestBooleanValidityKernels = TestValidityKernels; -using TestFloatValidityKernels = TestValidityKernels; -using TestDoubleValidityKernels = TestValidityKernels; TEST_F(TestBooleanValidityKernels, ArrayIsValid) { CheckScalarUnary("is_valid", type_singleton(), "[]", type_singleton(), "[]"); @@ -51,36 +49,48 @@ TEST_F(TestBooleanValidityKernels, ArrayIsValid) { "[false, true, true, false]"); } -TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) { - CheckScalarUnary("is_null", std::make_shared(5), - ArrayFromJSON(boolean(), "[true, true, true, true, true]")); - CheckScalarUnary("is_valid", std::make_shared(5), - ArrayFromJSON(boolean(), "[false, false, false, false, false]")); -} - TEST_F(TestBooleanValidityKernels, ArrayIsValidBufferPassthruOptimization) { Datum arg = ArrayFromJSON(boolean(), "[null, 1, 0, null]"); ASSERT_OK_AND_ASSIGN(auto validity, arrow::compute::IsValid(arg)); ASSERT_EQ(validity.array()->buffers[1], arg.array()->buffers[0]); } -TEST_F(TestBooleanValidityKernels, ScalarIsValid) { - CheckScalarUnary("is_valid", MakeScalar(19.7), MakeScalar(true)); - CheckScalarUnary("is_valid", MakeNullScalar(float64()), MakeScalar(false)); -} +TEST_F(TestBooleanValidityKernels, IsNull) { + auto ty = type_singleton(); + NullOptions default_options; + NullOptions nan_is_null_options(/*nan_is_null=*/true); + + CheckScalarUnary("is_null", ty, "[]", boolean(), "[]"); + CheckScalarUnary("is_null", ty, "[]", boolean(), "[]", &default_options); + CheckScalarUnary("is_null", ty, "[]", boolean(), "[]", &nan_is_null_options); -TEST_F(TestBooleanValidityKernels, ArrayIsNull) { - CheckScalarUnary("is_null", type_singleton(), "[]", type_singleton(), "[]"); - CheckScalarUnary("is_null", type_singleton(), "[null]", type_singleton(), "[true]"); - CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); - CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), + CheckScalarUnary("is_null", ty, "[null]", boolean(), "[true]"); + CheckScalarUnary("is_null", ty, "[null]", boolean(), "[true]", &default_options); + CheckScalarUnary("is_null", ty, "[null]", boolean(), "[true]", &nan_is_null_options); + + CheckScalarUnary("is_null", ty, "[1]", boolean(), "[false]"); + CheckScalarUnary("is_null", ty, "[1]", boolean(), "[false]", &default_options); + CheckScalarUnary("is_null", ty, "[1]", boolean(), "[false]", &nan_is_null_options); + + CheckScalarUnary("is_null", ty, "[null, 1, 0, null]", boolean(), "[true, false, false, true]"); + CheckScalarUnary("is_null", ty, "[null, 1, 0, null]", boolean(), + "[true, false, false, true]", &default_options); + CheckScalarUnary("is_null", ty, "[null, 1, 0, null]", boolean(), + "[true, false, false, true]", &nan_is_null_options); +} + +TEST(TestValidityKernels, IsValidIsNullNullType) { + CheckScalarUnary("is_null", std::make_shared(5), + ArrayFromJSON(boolean(), "[true, true, true, true, true]")); + CheckScalarUnary("is_valid", std::make_shared(5), + ArrayFromJSON(boolean(), "[false, false, false, false, false]")); } -TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { - auto arr = ArrayFromJSON(int32(), "[1, 2, 3, 4]"); - std::shared_ptr result = (*IsNull(arr)).array(); - ASSERT_EQ(result->null_count, 0); +TEST(TestValidityKernels, IsNullSetsZeroNullCount) { + auto arr = ArrayFromJSON(int32(), "[1, 2, 3, 4, null]"); + ASSERT_OK_AND_ASSIGN(Datum out, IsNull(arr)); + ASSERT_EQ(out.array()->null_count, 0); } template