diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index b0aeec7c92e..599ce75bece 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -161,6 +161,8 @@ static auto kMakeStructOptionsType = GetFunctionOptionsType( static auto kDayOfWeekOptionsType = GetFunctionOptionsType( DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering), DataMember("week_start", &DayOfWeekOptions::week_start)); +static auto kNullOptionsType = GetFunctionOptionsType( + DataMember("nan_is_null", &NullOptions::nan_is_null)); } // namespace } // namespace internal @@ -291,6 +293,10 @@ DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start week_start(week_start) {} constexpr char DayOfWeekOptions::kTypeName[]; +NullOptions::NullOptions(bool nan_is_null) + : FunctionOptions(internal::kNullOptionsType), nan_is_null(nan_is_null) {} +constexpr char NullOptions::kTypeName[]; + namespace internal { void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType)); @@ -310,6 +316,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kNullOptionsType)); } } // namespace internal @@ -463,7 +470,6 @@ Result Compare(const Datum& left, const Datum& right, CompareOptions opti // Validity functions SCALAR_EAGER_UNARY(IsValid, "is_valid") -SCALAR_EAGER_UNARY(IsNull, "is_null") SCALAR_EAGER_UNARY(IsNan, "is_nan") Result FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx) { @@ -483,6 +489,10 @@ Result CaseWhen(const Datum& cond, const std::vector& cases, return CallFunction("case_when", args, ctx); } +Result IsNull(const Datum& arg, NullOptions options, ExecContext* 
ctx) { + return CallFunction("is_null", {arg}, &options, ctx); +} + // ---------------------------------------------------------------------- // Temporal functions diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index f1672a05223..769ab0e7874 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -224,6 +224,15 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { int64_t start, stop, step; }; +class ARROW_EXPORT NullOptions : public FunctionOptions { + public: + explicit NullOptions(bool nan_is_null = false); + constexpr static char const kTypeName[] = "NullOptions"; + static NullOptions Defaults() { return NullOptions{}; } + + bool nan_is_null = false; +}; + enum CompareOperator : int8_t { EQUAL, NOT_EQUAL, @@ -756,13 +765,15 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); /// false otherwise /// /// \param[in] values input to examine for nullity +/// \param[in] options NullOptions /// \param[in] ctx the function execution context, optional /// \return the resulting datum /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result IsNull(const Datum& values, ExecContext* ctx = NULLPTR); +Result IsNull(const Datum& values, NullOptions options = NullOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief IsNan returns true for each element of `values` that is NaN, /// false otherwise diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc index 67a9f3c40ff..64e3305825d 100644 --- a/cpp/src/arrow/compute/exec/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -1154,7 +1154,9 @@ Expression greater_equal(Expression lhs, Expression rhs) { return call("greater_equal", {std::move(lhs), std::move(rhs)}); } -Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); } +Expression is_null(Expression lhs, bool nan_is_null) { + return call("is_null", {std::move(lhs)}, 
compute::NullOptions(std::move(nan_is_null))); +} Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); } diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h index 3810accf70a..dac5728ab46 100644 --- a/cpp/src/arrow/compute/exec/expression.h +++ b/cpp/src/arrow/compute/exec/expression.h @@ -255,7 +255,7 @@ ARROW_EXPORT Expression greater(Expression lhs, Expression rhs); ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs); -ARROW_EXPORT Expression is_null(Expression lhs); +ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false); ARROW_EXPORT Expression is_valid(Expression lhs); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index ead88abc0f2..d23a909c6fd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" #include "arrow/util/bit_util.h" @@ -74,21 +75,71 @@ struct IsInfOperator { } }; +using NanOptionsState = OptionsWrapper; + struct IsNullOperator { static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) { - checked_cast(out)->value = !in.is_valid; + const auto& options = NanOptionsState::Get(ctx); + bool* out_value = &checked_cast(out)->value; + + if (in.is_valid) { + if (options.nan_is_null && is_floating(in.type->id())) { + switch (in.type->id()) { + case Type::FLOAT: + *out_value = std::isnan(internal::UnboxScalar::Unbox(in)); + break; + case Type::DOUBLE: + *out_value = std::isnan(internal::UnboxScalar::Unbox(in)); + break; + default: + return Status::NotImplemented("NaN detection not implemented for type ", + in.type->ToString()); + } + } else { + *out_value = false; + } + } else { + *out_value = true; + } + return Status::OK(); } + template + static void SetNanBits(const ArrayData& arr, uint8_t* 
out_bitmap, int64_t out_offset) { + const T* data = arr.GetValues(1); + for (int64_t i = 0; i < arr.length; ++i) { + if (std::isnan(data[i])) { + BitUtil::SetBit(out_bitmap, i + out_offset); + } + } + } + static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) { - if (arr.MayHaveNulls()) { + const auto& options = NanOptionsState::Get(ctx); + + uint8_t* out_bitmap = out->buffers[1]->mutable_data(); + if (arr.GetNullCount() > 0) { // Input has nulls => output is the inverted null (validity) bitmap. - InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, - out->buffers[1]->mutable_data(), out->offset); + InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, out_bitmap, + out->offset); } else { // Input has no nulls => output is entirely false. - BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, - false); + BitUtil::SetBitsTo(out_bitmap, out->offset, out->length, false); + } + + if (is_floating(arr.type->id()) && options.nan_is_null) { + switch (arr.type->id()) { + case Type::FLOAT: + SetNanBits(arr, out_bitmap, out->offset); + break; + case Type::DOUBLE: + SetNanBits(arr, out_bitmap, out->offset); + break; + default: + return Status::NotImplemented("NaN detection not implemented for type ", + arr.type->ToString()); + } } return Status::OK(); } @@ -104,11 +155,13 @@ struct IsNanOperator { void MakeFunction(std::string name, const FunctionDoc* doc, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, - MemAllocation::type mem_allocation, bool can_write_into_slices) { + MemAllocation::type mem_allocation, bool can_write_into_slices, + const FunctionOptions* default_options = NULLPTR, + KernelInit init = NULLPTR) { Arity arity{static_cast(in_types.size())}; - auto func = std::make_shared(name, arity, doc); + auto func = std::make_shared(name, arity, doc, default_options); - ScalarKernel kernel(std::move(in_types), out_type, exec); + ScalarKernel 
kernel(std::move(in_types), out_type, exec, init); kernel.null_handling = NullHandling::OUTPUT_NOT_NULL; kernel.can_write_into_slices = can_write_into_slices; kernel.mem_allocation = mem_allocation; @@ -202,9 +255,11 @@ const FunctionDoc is_inf_doc( ("For each input value, emit true iff the value is infinite (inf or -inf)."), {"values"}); -const FunctionDoc is_null_doc("Return true if null", - ("For each input value, emit true iff the value is null."), - {"values"}); +const FunctionDoc is_null_doc( + "Return true if null (and optionally NaN)", + ("For each input value, emit true iff the value is null.\n" + "True may also be emitted for NaN values by setting the `nan_is_null` flag."), + {"values"}, "NullOptions"); const FunctionDoc is_nan_doc("Return true if NaN", ("For each input value, emit true iff the value is NaN."), @@ -213,12 +268,13 @@ const FunctionDoc is_nan_doc("Return true if NaN", } // namespace void RegisterScalarValidity(FunctionRegistry* registry) { + static auto kNullOptions = NullOptions::Defaults(); MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec, registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false); MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec, registry, MemAllocation::PREALLOCATE, - /*can_write_into_slices=*/true); + /*can_write_into_slices=*/true, &kNullOptions, NanOptionsState::Init); DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc))); DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc))); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 1a7a1cbda15..35a6b831ef4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -40,8 +40,6 @@ class TestValidityKernels : public ::testing::Test { }; using TestBooleanValidityKernels = TestValidityKernels; -using 
TestFloatValidityKernels = TestValidityKernels; -using TestDoubleValidityKernels = TestValidityKernels; TEST_F(TestBooleanValidityKernels, ArrayIsValid) { CheckScalarUnary("is_valid", type_singleton(), "[]", type_singleton(), "[]"); @@ -51,178 +49,158 @@ TEST_F(TestBooleanValidityKernels, ArrayIsValid) { "[false, true, true, false]"); } -TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) { - CheckScalarUnary("is_null", std::make_shared(5), - ArrayFromJSON(boolean(), "[true, true, true, true, true]")); - CheckScalarUnary("is_valid", std::make_shared(5), - ArrayFromJSON(boolean(), "[false, false, false, false, false]")); -} - TEST_F(TestBooleanValidityKernels, ArrayIsValidBufferPassthruOptimization) { Datum arg = ArrayFromJSON(boolean(), "[null, 1, 0, null]"); ASSERT_OK_AND_ASSIGN(auto validity, arrow::compute::IsValid(arg)); ASSERT_EQ(validity.array()->buffers[1], arg.array()->buffers[0]); } -TEST_F(TestBooleanValidityKernels, ScalarIsValid) { - CheckScalarUnary("is_valid", MakeScalar(19.7), MakeScalar(true)); - CheckScalarUnary("is_valid", MakeNullScalar(float64()), MakeScalar(false)); -} +TEST_F(TestBooleanValidityKernels, IsNull) { + auto ty = type_singleton(); + NullOptions default_options; + NullOptions nan_is_null_options(/*nan_is_null=*/true); -TEST_F(TestBooleanValidityKernels, ArrayIsNull) { - CheckScalarUnary("is_null", type_singleton(), "[]", type_singleton(), "[]"); - CheckScalarUnary("is_null", type_singleton(), "[null]", type_singleton(), "[true]"); - CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); - CheckScalarUnary("is_null", type_singleton(), "[null, 1, 0, null]", type_singleton(), - "[true, false, false, true]"); -} + CheckScalarUnary("is_null", ty, "[]", boolean(), "[]"); + CheckScalarUnary("is_null", ty, "[]", boolean(), "[]", &default_options); + CheckScalarUnary("is_null", ty, "[]", boolean(), "[]", &nan_is_null_options); -TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { - auto arr = 
ArrayFromJSON(int32(), "[1, 2, 3, 4]"); - std::shared_ptr result = (*IsNull(arr)).array(); - ASSERT_EQ(result->null_count, 0); -} + CheckScalarUnary("is_null", ty, "[null]", boolean(), "[true]"); + CheckScalarUnary("is_null", ty, "[null]", boolean(), "[true]", &default_options); + CheckScalarUnary("is_null", ty, "[null]", boolean(), "[true]", &nan_is_null_options); -TEST_F(TestBooleanValidityKernels, ScalarIsNull) { - CheckScalarUnary("is_null", MakeScalar(19.7), MakeScalar(false)); - CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); -} + CheckScalarUnary("is_null", ty, "[1]", boolean(), "[false]"); + CheckScalarUnary("is_null", ty, "[1]", boolean(), "[false]", &default_options); + CheckScalarUnary("is_null", ty, "[1]", boolean(), "[false]", &nan_is_null_options); -TEST_F(TestFloatValidityKernels, FloatArrayIsFinite) { - // All Inf - CheckScalarUnary("is_finite", ArrayFromJSON(float32(), "[Inf, -Inf, Inf, -Inf, Inf]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false]")); - // No Inf - CheckScalarUnary("is_finite", - ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, NaN, null]"), - ArrayFromJSON(boolean(), "[true, true, true, true, false, null]")); - // Some Inf - CheckScalarUnary("is_finite", - ArrayFromJSON(float32(), "[0.0, Inf, 2.0, -Inf, NaN, null]"), - ArrayFromJSON(boolean(), "[true, false, true, false, false, null]")); + CheckScalarUnary("is_null", ty, "[null, 1, 0, null]", boolean(), + "[true, false, false, true]"); + CheckScalarUnary("is_null", ty, "[null, 1, 0, null]", boolean(), + "[true, false, false, true]", &default_options); + CheckScalarUnary("is_null", ty, "[null, 1, 0, null]", boolean(), + "[true, false, false, true]", &nan_is_null_options); } -TEST_F(TestDoubleValidityKernels, DoubleArrayIsFinite) { - // All Inf - CheckScalarUnary("is_finite", ArrayFromJSON(float64(), "[Inf, -Inf, Inf, -Inf, Inf]"), +TEST(TestValidityKernels, IsValidIsNullNullType) { + CheckScalarUnary("is_null", std::make_shared(5), + 
ArrayFromJSON(boolean(), "[true, true, true, true, true]")); + CheckScalarUnary("is_valid", std::make_shared(5), ArrayFromJSON(boolean(), "[false, false, false, false, false]")); - // No Inf - CheckScalarUnary("is_finite", - ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, NaN, null]"), - ArrayFromJSON(boolean(), "[true, true, true, true, false, null]")); - // Some Inf - CheckScalarUnary("is_finite", - ArrayFromJSON(float64(), "[0.0, Inf, 2.0, -Inf, NaN, null]"), - ArrayFromJSON(boolean(), "[true, false, true, false, false, null]")); } -TEST_F(TestFloatValidityKernels, FloatScalarIsFinite) { - CheckScalarUnary("is_finite", MakeNullScalar(float32()), MakeNullScalar(boolean())); - CheckScalarUnary("is_finite", MakeScalar(42.0f), MakeScalar(true)); - CheckScalarUnary("is_finite", MakeScalar(std::nanf("")), MakeScalar(false)); - CheckScalarUnary("is_finite", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false)); - CheckScalarUnary("is_finite", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(false)); +TEST(TestValidityKernels, IsNullSetsZeroNullCount) { + auto arr = ArrayFromJSON(int32(), "[1, 2, 3, 4, null]"); + ASSERT_OK_AND_ASSIGN(Datum out, IsNull(arr)); + ASSERT_EQ(out.array()->null_count, 0); } -TEST_F(TestDoubleValidityKernels, DoubleScalarIsFinite) { - CheckScalarUnary("is_finite", MakeNullScalar(float64()), MakeNullScalar(boolean())); - CheckScalarUnary("is_finite", MakeScalar(42.0), MakeScalar(true)); - CheckScalarUnary("is_finite", MakeScalar(std::nan("")), MakeScalar(false)); - CheckScalarUnary("is_finite", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false)); - CheckScalarUnary("is_finite", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(false)); -} +template +class TestFloatingPointValidityKernels : public TestValidityKernels { + public: + void TestIsNull() { + NullOptions default_options; + NullOptions nan_is_null_options(/*nan_is_null=*/true); + + auto ty = this->type_singleton(); + auto arr = 
ArrayFromJSON(ty, "[]"); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[]")); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[]"), &default_options); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[]"), + &nan_is_null_options); + + // Without nulls + arr = ArrayFromJSON(ty, "[1.5, 0.0, -0.0, Inf, -Inf, NaN]"); + CheckScalarUnary( + "is_null", arr, + ArrayFromJSON(boolean(), "[false, false, false, false, false, false]")); + CheckScalarUnary( + "is_null", arr, + ArrayFromJSON(boolean(), "[false, false, false, false, false, false]"), + &default_options); + CheckScalarUnary( + "is_null", arr, + ArrayFromJSON(boolean(), "[false, false, false, false, false, true]"), + &nan_is_null_options); + + // With nulls + arr = ArrayFromJSON(ty, "[1.5, -0.0, null, Inf, -Inf, NaN]"); + CheckScalarUnary( + "is_null", arr, + ArrayFromJSON(boolean(), "[false, false, true, false, false, false]")); + CheckScalarUnary( + "is_null", arr, + ArrayFromJSON(boolean(), "[false, false, true, false, false, false]"), + &default_options); + CheckScalarUnary("is_null", arr, + ArrayFromJSON(boolean(), "[false, false, true, false, false, true]"), + &nan_is_null_options); + + // Only nulls + arr = ArrayFromJSON(ty, "[null, null, null]"); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[true, true, true]")); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[true, true, true]"), + &default_options); + CheckScalarUnary("is_null", arr, ArrayFromJSON(boolean(), "[true, true, true]"), + &nan_is_null_options); + } -TEST_F(TestFloatValidityKernels, FloatArrayIsInf) { - // All Inf - CheckScalarUnary("is_inf", ArrayFromJSON(float32(), "[Inf, -Inf, Inf, -Inf, Inf]"), - ArrayFromJSON(boolean(), "[true, true, true, true, true]")); - // No Inf - CheckScalarUnary("is_inf", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, NaN, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); - // Some Infs - CheckScalarUnary("is_inf", 
ArrayFromJSON(float32(), "[0.0, Inf, 2.0, -Inf, NaN, null]"), - ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); -} + void TestIsFinite() { + auto ty = this->type_singleton(); + CheckScalarUnary("is_finite", ArrayFromJSON(ty, "[]"), + ArrayFromJSON(boolean(), "[]")); + + // All Inf + CheckScalarUnary("is_finite", ArrayFromJSON(ty, "[Inf, -Inf, Inf, -Inf, Inf]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false]")); + // No Inf + CheckScalarUnary("is_finite", ArrayFromJSON(ty, "[0.0, 1.0, 2.0, 3.0, NaN, null]"), + ArrayFromJSON(boolean(), "[true, true, true, true, false, null]")); + // Some Inf + CheckScalarUnary("is_finite", ArrayFromJSON(ty, "[0.0, Inf, 2.0, -Inf, NaN, null]"), + ArrayFromJSON(boolean(), "[true, false, true, false, false, null]")); + } -TEST_F(TestDoubleValidityKernels, DoubleArrayIsInf) { - // All Inf - CheckScalarUnary("is_inf", ArrayFromJSON(float64(), "[Inf, -Inf, Inf, -Inf, Inf]"), - ArrayFromJSON(boolean(), "[true, true, true, true, true]")); - // No Inf - CheckScalarUnary("is_inf", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, NaN, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); - // Some Infs - CheckScalarUnary("is_inf", ArrayFromJSON(float64(), "[0.0, Inf, 2.0, -Inf, NaN, null]"), - ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); -} + void TestIsInf() { + auto ty = this->type_singleton(); + CheckScalarUnary("is_inf", ArrayFromJSON(ty, "[]"), ArrayFromJSON(boolean(), "[]")); + + // All Inf + CheckScalarUnary("is_inf", ArrayFromJSON(ty, "[Inf, -Inf, Inf, -Inf, Inf]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]")); + // No Inf + CheckScalarUnary( + "is_inf", ArrayFromJSON(ty, "[0.0, 1.0, 2.0, 3.0, NaN, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); + // Some Inf + CheckScalarUnary("is_inf", ArrayFromJSON(ty, "[0.0, Inf, 2.0, -Inf, NaN, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, 
false, null]")); + } -TEST_F(TestFloatValidityKernels, FloatScalarIsInf) { - CheckScalarUnary("is_inf", MakeNullScalar(float32()), MakeNullScalar(boolean())); - CheckScalarUnary("is_inf", MakeScalar(42.0f), MakeScalar(false)); - CheckScalarUnary("is_inf", MakeScalar(std::nanf("")), MakeScalar(false)); - CheckScalarUnary("is_inf", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(true)); - CheckScalarUnary("is_inf", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(true)); -} + void TestIsNan() { + auto ty = this->type_singleton(); + CheckScalarUnary("is_nan", ArrayFromJSON(ty, "[]"), ArrayFromJSON(boolean(), "[]")); + + // All NaN + CheckScalarUnary("is_nan", ArrayFromJSON(ty, "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]")); + // No NaN + CheckScalarUnary( + "is_nan", ArrayFromJSON(ty, "[0.0, 1.0, 2.0, 3.0, Inf, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); + // Some NaNs + CheckScalarUnary("is_nan", ArrayFromJSON(ty, "[0.0, NaN, 2.0, NaN, Inf, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); + } +}; -TEST_F(TestDoubleValidityKernels, DoubleScalarIsInf) { - CheckScalarUnary("is_inf", MakeNullScalar(float64()), MakeNullScalar(boolean())); - CheckScalarUnary("is_inf", MakeScalar(42.0), MakeScalar(false)); - CheckScalarUnary("is_inf", MakeScalar(std::nan("")), MakeScalar(false)); - CheckScalarUnary("is_inf", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(true)); - CheckScalarUnary("is_inf", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(true)); -} +TYPED_TEST_SUITE(TestFloatingPointValidityKernels, RealArrowTypes); -TEST_F(TestFloatValidityKernels, FloatArrayIsNan) { - // All NaN - CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), - ArrayFromJSON(boolean(), "[true, true, true, true, true]")); - // No NaN - CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, Inf, 
null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); - // Some NaNs - CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, Inf, null]"), - ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); -} +TYPED_TEST(TestFloatingPointValidityKernels, IsNull) { this->TestIsNull(); } -TEST_F(TestDoubleValidityKernels, DoubleArrayIsNan) { - // All NaN - CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), - ArrayFromJSON(boolean(), "[true, true, true, true, true]")); - // No NaN - CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, Inf, null]"), - ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); - // Some NaNs - CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, Inf, null]"), - ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); -} +TYPED_TEST(TestFloatingPointValidityKernels, IsFinite) { this->TestIsFinite(); } -TEST_F(TestFloatValidityKernels, FloatScalarIsNan) { - CheckScalarUnary("is_nan", MakeNullScalar(float32()), MakeNullScalar(boolean())); - CheckScalarUnary("is_nan", MakeScalar(42.0f), MakeScalar(false)); - CheckScalarUnary("is_nan", MakeScalar(std::nanf("")), MakeScalar(true)); - CheckScalarUnary("is_nan", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false)); - CheckScalarUnary("is_nan", MakeScalar(-std::numeric_limits::infinity()), - MakeScalar(false)); -} +TYPED_TEST(TestFloatingPointValidityKernels, IsInf) { this->TestIsInf(); } -TEST_F(TestDoubleValidityKernels, DoubleScalarIsNan) { - CheckScalarUnary("is_nan", MakeNullScalar(float64()), MakeNullScalar(boolean())); - CheckScalarUnary("is_nan", MakeScalar(42.0), MakeScalar(false)); - CheckScalarUnary("is_nan", MakeScalar(std::nan("")), MakeScalar(true)); - CheckScalarUnary("is_nan", MakeScalar(std::numeric_limits::infinity()), - MakeScalar(false)); - CheckScalarUnary("is_nan", MakeScalar(-std::numeric_limits::infinity()), 
- MakeScalar(false)); -} +TYPED_TEST(TestFloatingPointValidityKernels, IsNan) { this->TestIsNan(); } } // namespace compute } // namespace arrow diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index c5ecbb419d1..653fb224e50 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -987,49 +987,66 @@ in reverse order. as given by :struct:`SliceOptions` where ``start`` and ``stop`` are measured in codeunits. Null inputs emit null. -.. _cpp-compute-scalar-structural-transforms: +Categorizations +~~~~~~~~~~~~~~~ + ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++===================+============+=====================+=====================+========================+=========+ +| is_finite | Unary | Float, Double | Boolean | | \(1) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_inf | Unary | Float, Double | Boolean | | \(2) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_nan | Unary | Float, Double | Boolean | | \(3) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_null | Unary | Any | Boolean | :struct:`NullOptions` | \(4) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_valid | Unary | Any | Boolean | | \(5) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ + +* \(1) Output is true iff the corresponding input element is finite (neither Infinity, + -Infinity, nor NaN). + +* \(2) Output is true iff the corresponding input element is Infinity/-Infinity. + +* \(3) Output is true iff the corresponding input element is NaN. 
+ +* \(4) Output is true iff the corresponding input element is null. NaN values + can also be considered null by setting :struct:`NullOptions::nan_is_null`. + +* \(5) Output is true iff the corresponding input element is non-null. -Structural transforms -~~~~~~~~~~~~~~~~~~~~~ -.. XXX (this category is a bit of a hodgepodge) - -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| Function name | Arity | Input types | Output type | Notes | -+==========================+============+===================================================+=====================+=========+ -| case_when | Varargs | Struct of Boolean (Arg 0), Any (rest) | Input type | \(1) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| choose | Varargs | Integral (Arg 0); Fixed-width/Binary-like (rest) | Input type | \(2) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| coalesce | Varargs | Any | Input type | \(3) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| fill_null | Binary | Boolean, Null, Numeric, Temporal, String-like | Input type | \(4) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| if_else | Ternary | Boolean, Null, Numeric, Temporal | Input type | \(5) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_finite | Unary | Float, Double | Boolean | \(6) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_inf | Unary | Float, Double | Boolean | \(7) | 
-+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_nan | Unary | Float, Double | Boolean | \(8) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_null | Unary | Any | Boolean | \(9) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| is_valid | Unary | Any | Boolean | \(10) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| list_value_length | Unary | List-like | Int32 or Int64 | \(11) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ -| make_struct | Varargs | Any | Struct | \(12) | -+--------------------------+------------+---------------------------------------------------+---------------------+---------+ - -* \(1) This function acts like a SQL 'case when' statement or switch-case. The +.. _cpp-compute-scalar-selections: + +Selecting / multiplexing +~~~~~~~~~~~~~~~~~~~~~~~~ + +For each "row" of input values, these functions emit one of the input values, +depending on a condition. 
+ ++------------------+------------+---------------------------------------------------+---------------------+---------+ +| Function name | Arity | Input types | Output type | Notes | ++==================+============+===================================================+=====================+=========+ +| case_when | Varargs | Struct of Boolean (Arg 0), Any (rest) | Input type | \(1) | ++------------------+------------+---------------------------------------------------+---------------------+---------+ +| choose | Varargs | Integral (Arg 0); Fixed-width/Binary-like (rest) | Input type | \(2) | ++------------------+------------+---------------------------------------------------+---------------------+---------+ +| coalesce | Varargs | Any | Input type | \(3) | ++------------------+------------+---------------------------------------------------+---------------------+---------+ +| fill_null | Binary | Boolean, Null, Numeric, Temporal, String-like | Input type | \(4) | ++------------------+------------+---------------------------------------------------+---------------------+---------+ +| if_else | Ternary | Boolean, Null, Numeric, Temporal | Input type | \(5) | ++------------------+------------+---------------------------------------------------+---------------------+---------+ + +* \(1) This function acts like a SQL "case when" statement or switch-case. The input is a "condition" value, which is a struct of Booleans, followed by the values for each "branch". There must be either exactly one value argument for each child of the condition struct, or one more value argument than children - (in which case we have an 'else' or 'default' value). The output is of the + (in which case we have an "else" or "default" value). The output is of the same type as the value inputs; each row will be the corresponding value from the first value datum for which the corresponding Boolean is true, or the - corresponding value from the 'default' input, or null otherwise. 
+ corresponding value from the "default" input, or null otherwise. Note that currently, while all types are supported, dictionaries will be unpacked. @@ -1057,21 +1074,21 @@ Structural transforms Also see: :ref:`replace_with_mask `. -* \(6) Output is true iff the corresponding input element is finite (not Infinity, - -Infinity, or NaN). - -* \(7) Output is true iff the corresponding input element is Infinity/-Infinity. - -* \(8) Output is true iff the corresponding input element is NaN. - -* \(9) Output is true iff the corresponding input element is null. +Structural transforms +~~~~~~~~~~~~~~~~~~~~~ -* \(10) Output is true iff the corresponding input element is non-null. ++--------------------------+------------+----------------+-------------------+------------------------------+---------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++==========================+============+================+===================+==============================+=========+ +| list_value_length | Unary | List-like | Int32 or Int64 | | \(1) | ++--------------------------+------------+----------------+-------------------+------------------------------+---------+ +| make_struct | Varargs | Any | Struct | :struct:`MakeStructOptions` | \(2) | ++--------------------------+------------+----------------+-------------------+------------------------------+---------+ -* \(11) Each output element is the length of the corresponding input element +* \(1) Each output element is the length of the corresponding input element (null if input is null). Output type is Int32 for List, Int64 for LargeList. -* \(12) The output struct's field types are the types of its arguments. The +* \(2) The output struct's field types are the types of its arguments. The field names are specified using an instance of :struct:`MakeStructOptions`. The output shape will be scalar if all inputs are scalar, otherwise any scalars will be broadcast to arrays. 
@@ -1366,4 +1383,4 @@ replaced, based on the remaining inputs. is true is replaced with the next value from input 3. A null in input 2 results in a corresponding null in the output. - Also see: :ref:`if_else `. + Also see: :ref:`if_else `. diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 35a4ff696db..99ad14496ca 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1005,6 +1005,18 @@ class DayOfWeekOptions(_DayOfWeekOptions): self._set_options(one_based_numbering, week_start) +cdef class _NullOptions(FunctionOptions): + def _set_options(self, nan_is_null): + self.wrapped.reset( + new CNullOptions(nan_is_null) + ) + + +class NullOptions(_NullOptions): + def __init__(self, nan_is_null=False): + self._set_options(nan_is_null) + + cdef class _VarianceOptions(FunctionOptions): def _set_options(self, ddof): self.wrapped.reset(new CVarianceOptions(ddof)) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 945475bd7f1..7342d9c57f5 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -234,9 +234,13 @@ cdef class Expression(_Weakrefable): """Checks whether the expression is not-null (valid)""" return Expression._call("is_valid", [self]) - def is_null(self): + def is_null(self, bint nan_is_null=False): """Checks whether the expression is null""" - return Expression._call("is_null", [self]) + cdef: + shared_ptr[CFunctionOptions] c_options + + c_options.reset(new CNullOptions(nan_is_null)) + return Expression._call("is_null", [self], c_options) def cast(self, type, bint safe=True): """Explicitly change the expression's data type""" diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 455ec3bac06..558184588d2 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1039,11 +1039,21 @@ cdef class Array(_PandasConvertible): else: return 0 - def is_null(self): + def is_null(self, *, nan_is_null=False): """ Return BooleanArray indicating the null 
values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array """ - return _pc().is_null(self) + options = _pc().NullOptions(nan_is_null) + return _pc().call_function('is_null', [self], options) def is_valid(self): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index eea2328a49a..1a017ea2ef4 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -32,6 +32,7 @@ ArraySortOptions, CastOptions, CountOptions, + DayOfWeekOptions, DictionaryEncodeOptions, ElementWiseAggregateOptions, ExtractRegexOptions, @@ -43,6 +44,7 @@ PadOptions, PartitionNthOptions, MakeStructOptions, + NullOptions, QuantileOptions, ReplaceSliceOptions, ReplaceSubstringOptions, @@ -52,9 +54,8 @@ SortOptions, SplitOptions, SplitPatternOptions, - StrptimeOptions, StrftimeOptions, - DayOfWeekOptions, + StrptimeOptions, TakeOptions, TDigestOptions, TrimOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 025eeef18af..2a972e3f6c3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1961,6 +1961,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: c_bool one_based_numbering uint32_t week_start + cdef cppclass CNullOptions \ + "arrow::compute::NullOptions"(CFunctionOptions): + CNullOptions(c_bool nan_is_null) + c_bool nan_is_null + cdef cppclass CVarianceOptions \ "arrow::compute::VarianceOptions"(CFunctionOptions): CVarianceOptions(int ddof) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 8c1355d2016..247f6d67baa 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -170,15 +170,25 @@ cdef class ChunkedArray(_PandasConvertible): else: index -= self.chunked_array.chunk(j).get().length() - def is_null(self): + def is_null(self, *, nan_is_null=False): """ - Return 
BooleanArray indicating the null values. + Return boolean array indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array or ChunkedArray """ - return _pc().is_null(self) + options = _pc().NullOptions(nan_is_null) + return _pc().call_function('is_null', [self], options) def is_valid(self): """ - Return BooleanArray indicating the non-null values. + Return boolean array indicating the non-null values. """ return _pc().is_valid(self) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index cac7a3c95e5..3c05a3071e3 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1310,7 +1310,6 @@ def test_arithmetic_multiply(): def test_is_null(): arr = pa.array([1, 2, 3, None]) result = arr.is_null() - result = arr.is_null() expected = pa.array([False, False, False, True]) assert result.equals(expected) assert result.equals(pc.is_null(arr)) @@ -1327,6 +1326,15 @@ def test_is_null(): expected = pa.chunked_array([[True, True], [True, False]]) assert result.equals(expected) + arr = pa.array([1, 2, 3, None, np.nan]) + result = arr.is_null() + expected = pa.array([False, False, False, True, False]) + assert result.equals(expected) + + result = arr.is_null(nan_is_null=True) + expected = pa.array([False, False, False, True, True]) + assert result.equals(expected) + def test_fill_null(): arr = pa.array([1, 2, None, 4], type=pa.int8()) diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R index b3635f239c4..b1ff18fe434 100644 --- a/r/R/arrow-datum.R +++ b/r/R/arrow-datum.R @@ -49,13 +49,7 @@ is.infinite.ArrowDatum <- function(x) { #' @export is.na.ArrowDatum <- function(x) { - # TODO: if an option is added to the is_null kernel to treat NaN as NA, - # use that to simplify the code here (ARROW-13367) - if (x$type_id() %in% TYPES_WITH_NAN) { - 
call_function("is_nan", x) | call_function("is_null", x) - } else { - call_function("is_null", x) - } + call_function("is_null", x, options = list(nan_is_null = TRUE)) } #' @export diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index efba9f287f9..a6b47eede0d 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -96,14 +96,7 @@ nse_funcs$coalesce <- function(...) { } nse_funcs$is.na <- function(x) { - # TODO: if an option is added to the is_null kernel to treat NaN as NA, - # use that to simplify the code here (ARROW-13367) - if (is.double(x) || (inherits(x, "Expression") && - x$type_id() %in% TYPES_WITH_NAN)) { - build_expr("is_nan", x) | build_expr("is_null", x) - } else { - build_expr("is_null", x) - } + build_expr("is_null", x, options = list(nan_is_null = TRUE)) } nse_funcs$is.nan <- function(x) { diff --git a/r/R/expression.R b/r/R/expression.R index 0526eb73bc9..57466cc3c71 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -231,11 +231,5 @@ Ops.Expression <- function(e1, e2) { #' @export is.na.Expression <- function(x) { - if (!is.null(x$schema) && x$type_id() %in% TYPES_WITH_NAN) { - # TODO: if an option is added to the is_null kernel to treat NaN as NA, - # use that to simplify the code here (ARROW-13367) - Expression$create("is_nan", x) | build_expr("is_null", x) - } else { - Expression$create("is_null", x) - } + Expression$create("is_null", x, options = list(nan_is_null = TRUE)) } diff --git a/r/src/compute.cpp b/r/src/compute.cpp index b697ecd96a0..8a12e38cc22 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -222,6 +222,15 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options( cpp11::as_cpp<bool>(options["skip_nulls"])); } + if (func_name == "is_null") { + using Options = arrow::compute::NullOptions; + auto out = std::make_shared<Options>(Options::Defaults()); + if (!Rf_isNull(options["nan_is_null"])) { + out->nan_is_null = cpp11::as_cpp<bool>(options["nan_is_null"]); + } + return out; + } + if (func_name == "dictionary_encode") { using Options =
arrow::compute::DictionaryEncodeOptions; auto out = std::make_shared<Options>(Options::Defaults());