diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index c62281b8661..84918f0e44b 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -128,6 +128,7 @@ Result Compare(const Datum& left, const Datum& right, CompareOptions opti SCALAR_EAGER_UNARY(IsValid, "is_valid") SCALAR_EAGER_UNARY(IsNull, "is_null") +SCALAR_EAGER_UNARY(IsNan, "is_nan") Result FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx) { return CallFunction("fill_null", {values, fill_value}, ctx); diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 789ac909ccf..1292fb7f7e9 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -345,6 +345,18 @@ Result IsValid(const Datum& values, ExecContext* ctx = NULLPTR); ARROW_EXPORT Result IsNull(const Datum& values, ExecContext* ctx = NULLPTR); +/// \brief IsNan returns true for each element of `values` that is NaN, +/// false otherwise +/// +/// \param[in] values input to look for NaN +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 3.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IsNan(const Datum& values, ExecContext* ctx = NULLPTR); + /// \brief FillNull replaces each null element in `values` /// with `fill_value` /// diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 03702316ac5..1d399f322bf 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include + #include "arrow/compute/kernels/common.h" #include "arrow/util/bit_util.h" @@ -74,6 +76,13 @@ struct IsNullOperator { } }; +struct IsNanOperator { + template + static constexpr OutType Call(KernelContext*, const InType& value) { + return std::isnan(value); + } +}; + void MakeFunction(std::string name, const FunctionDoc* doc, std::vector in_types, OutputType out_type, ArrayKernelExec exec, FunctionRegistry* registry, @@ -90,6 +99,23 @@ void MakeFunction(std::string name, const FunctionDoc* doc, DCHECK_OK(registry->AddFunction(std::move(func))); } +template +void AddIsNanKernel(const std::shared_ptr& ty, ScalarFunction* func) { + DCHECK_OK( + func->AddKernel({ty}, boolean(), + applicator::ScalarUnary::Exec)); +} + +std::shared_ptr MakeIsNanFunction(std::string name, + const FunctionDoc* doc) { + auto func = std::make_shared(name, Arity::Unary(), doc); + + AddIsNanKernel(float32(), func.get()); + AddIsNanKernel(float64(), func.get()); + + return func; +} + void IsValidExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const Datum& arg0 = batch[0]; if (arg0.type()->id() == Type::NA) { @@ -132,6 +158,10 @@ const FunctionDoc is_null_doc("Return true if null", ("For each input value, emit true iff the value is null."), {"values"}); +const FunctionDoc is_nan_doc("Return true if NaN", + ("For each input value, emit true iff the value is NaN."), + {"values"}); + } // namespace void RegisterScalarValidity(FunctionRegistry* registry) { @@ -141,6 +171,8 @@ void RegisterScalarValidity(FunctionRegistry* registry) { MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec, registry, MemAllocation::PREALLOCATE, /*can_write_into_slices=*/true); + + DCHECK_OK(registry->AddFunction(MakeIsNanFunction("is_nan", &is_nan_doc))); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 0c89264635e..54fa5967f7a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -31,18 +31,19 @@ namespace arrow { namespace compute { +template class TestValidityKernels : public ::testing::Test { protected: - // XXX Since IsValid and IsNull don't touch any buffers but the null bitmap - // testing multiple types seems redundant. - using ArrowType = BooleanType; - static std::shared_ptr type_singleton() { return TypeTraits::type_singleton(); } }; -TEST_F(TestValidityKernels, ArrayIsValid) { +using TestBooleanValidityKernels = TestValidityKernels; +using TestFloatValidityKernels = TestValidityKernels; +using TestDoubleValidityKernels = TestValidityKernels; + +TEST_F(TestBooleanValidityKernels, ArrayIsValid) { CheckScalarUnary("is_valid", type_singleton(), "[]", type_singleton(), "[]"); CheckScalarUnary("is_valid", type_singleton(), "[null]", type_singleton(), "[false]"); CheckScalarUnary("is_valid", type_singleton(), "[1]", type_singleton(), "[true]"); @@ -50,25 +51,25 @@ TEST_F(TestValidityKernels, ArrayIsValid) { "[false, true, true, false]"); } -TEST_F(TestValidityKernels, IsValidIsNullNullType) { +TEST_F(TestBooleanValidityKernels, IsValidIsNullNullType) { CheckScalarUnary("is_null", std::make_shared(5), ArrayFromJSON(boolean(), "[true, true, true, true, true]")); CheckScalarUnary("is_valid", std::make_shared(5), ArrayFromJSON(boolean(), "[false, false, false, false, false]")); } -TEST_F(TestValidityKernels, ArrayIsValidBufferPassthruOptimization) { +TEST_F(TestBooleanValidityKernels, ArrayIsValidBufferPassthruOptimization) { Datum arg = ArrayFromJSON(boolean(), "[null, 1, 0, null]"); ASSERT_OK_AND_ASSIGN(auto validity, arrow::compute::IsValid(arg)); ASSERT_EQ(validity.array()->buffers[1], arg.array()->buffers[0]); } -TEST_F(TestValidityKernels, ScalarIsValid) { +TEST_F(TestBooleanValidityKernels, ScalarIsValid) { CheckScalarUnary("is_valid", MakeScalar(19.7), MakeScalar(true)); CheckScalarUnary("is_valid", MakeNullScalar(float64()), MakeScalar(false)); } -TEST_F(TestValidityKernels, ArrayIsNull) { +TEST_F(TestBooleanValidityKernels, ArrayIsNull) { CheckScalarUnary("is_null", type_singleton(), "[]", type_singleton(), "[]"); CheckScalarUnary("is_null", type_singleton(), "[null]", type_singleton(), "[true]"); CheckScalarUnary("is_null", type_singleton(), "[1]", type_singleton(), "[false]"); @@ -76,16 +77,52 @@ TEST_F(TestValidityKernels, ArrayIsNull) { "[true, false, false, true]"); } -TEST_F(TestValidityKernels, IsNullSetsZeroNullCount) { +TEST_F(TestBooleanValidityKernels, IsNullSetsZeroNullCount) { auto arr = ArrayFromJSON(int32(), "[1, 2, 3, 4]"); std::shared_ptr result = (*IsNull(arr)).array(); ASSERT_EQ(result->null_count, 0); } -TEST_F(TestValidityKernels, ScalarIsNull) { +TEST_F(TestBooleanValidityKernels, ScalarIsNull) { CheckScalarUnary("is_null", MakeScalar(19.7), MakeScalar(false)); CheckScalarUnary("is_null", MakeNullScalar(float64()), MakeScalar(true)); } +TEST_F(TestFloatValidityKernels, FloatArrayIsNan) { + // All NaN + CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]")); + // No NaN + CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[0.0, 1.0, 2.0, 3.0, 4.0, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); + // Some NaNs + CheckScalarUnary("is_nan", ArrayFromJSON(float32(), "[0.0, NaN, 2.0, NaN, 4.0, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); +} + +TEST_F(TestDoubleValidityKernels, DoubleArrayIsNan) { + // All NaN + CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[NaN, NaN, NaN, NaN, NaN]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]")); + // No NaN + CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, 3.0, 4.0, null]"), + ArrayFromJSON(boolean(), "[false, false, false, false, false, null]")); + // Some NaNs + CheckScalarUnary("is_nan", ArrayFromJSON(float64(), "[0.0, NaN, 2.0, NaN, 4.0, null]"), + ArrayFromJSON(boolean(), "[false, true, false, true, false, null]")); +} + +TEST_F(TestFloatValidityKernels, FloatScalarIsNan) { + CheckScalarUnary("is_nan", MakeNullScalar(float32()), MakeNullScalar(boolean())); + CheckScalarUnary("is_nan", MakeScalar(42.0f), MakeScalar(false)); + CheckScalarUnary("is_nan", MakeScalar(std::nanf("")), MakeScalar(true)); +} + +TEST_F(TestDoubleValidityKernels, DoubleScalarIsNan) { + CheckScalarUnary("is_nan", MakeNullScalar(float64()), MakeNullScalar(boolean())); + CheckScalarUnary("is_nan", MakeScalar(42.0), MakeScalar(false)); + CheckScalarUnary("is_nan", MakeScalar(std::nan("")), MakeScalar(true)); +} + } // namespace compute } // namespace arrow diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index d1c5dd28af2..472859708f0 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -453,22 +453,26 @@ Structural transforms +==========================+============+================================================+=====================+=========+ | fill_null | Binary | Boolean, Null, Numeric, Temporal, String-like | Input type | \(1) | +--------------------------+------------+------------------------------------------------+---------------------+---------+ -| is_null | Unary | Any | Boolean | \(2) | +| is_nan | Unary | Float, Double | Boolean | \(2) | +--------------------------+------------+------------------------------------------------+---------------------+---------+ -| is_valid | Unary | Any | Boolean | \(2) | +| is_null | Unary | Any | Boolean | \(3) | +--------------------------+------------+------------------------------------------------+---------------------+---------+ -| list_value_length | Unary | List-like | Int32 or Int64 | \(4) | +| is_valid | Unary | Any | Boolean | \(4) | ++--------------------------+------------+------------------------------------------------+---------------------+---------+ +| list_value_length | Unary | List-like | Int32 or Int64 | \(5) | +--------------------------+------------+------------------------------------------------+---------------------+---------+ * \(1) First input must be an array, second input a scalar of the same type. Output is an array of the same type as the inputs, and with the same values as the first input, except for nulls replaced with the second input value. -* \(2) Output is true iff the corresponding input element is non-null. +* \(2) Output is true iff the corresponding input element is NaN. * \(3) Output is true iff the corresponding input element is null. -* \(4) Each output element is the length of the corresponding input element +* \(4) Output is true iff the corresponding input element is non-null. + +* \(5) Each output element is the length of the corresponding input element (null if input is null). Output type is Int32 for List, Int64 for LargeList. Conversions