Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
73 commits
Select commit Hold shift + click to select a range
a0239aa
Add NanNullOptions for IsNull kernel
Christian8491 Aug 6, 2021
bdc3453
Add kNanNullOptionsType to RegisterScalarOptions
Christian8491 Aug 6, 2021
518bd3b
Add Init param for MafeFunction and docs for is_null kernel
Christian8491 Aug 6, 2021
c98c665
Add defaults() method to NanNullOptions class
Christian8491 Aug 6, 2021
c993efc
Add implementation for IsNullOperator, Scalar case
Christian8491 Aug 6, 2021
da9fecc
Fix test compilation issue
Christian8491 Aug 6, 2021
7b6ca78
Apply clang format, add tests for isnull
Christian8491 Aug 7, 2021
e4adc7d
Improve message for is_null tests
Christian8491 Aug 7, 2021
8bdfe99
Apply requested changes for IsNullOperator
Christian8491 Aug 9, 2021
685c038
Remove default break, add todo to handle ArrayData for is_null
Christian8491 Aug 9, 2021
05b458b
Apply SetBitsTo for NaN values when passed NanNullOptions
Christian8491 Aug 10, 2021
c47838e
move kNanNullOptions to anonymous namespace
Christian8491 Aug 11, 2021
5f6e8dd
Fix for arrow-compute-expression-test (is_null)
Christian8491 Aug 11, 2021
d91d64f
Add bindings in cython
Christian8491 Aug 11, 2021
41515f1
Add specialized tests for is_null
Christian8491 Aug 12, 2021
918539f
Fix Sanitizer for KNanNullOptions
Christian8491 Aug 12, 2021
527ad78
Remove cython bindigs
Christian8491 Aug 12, 2021
7847bc3
binding in cython layer, fix python tests (theorically)
Christian8491 Aug 12, 2021
4cf4e4b
Add NanNullOptions for IsNull kernel
Christian8491 Aug 6, 2021
8b0c427
Add kNanNullOptionsType to RegisterScalarOptions
Christian8491 Aug 6, 2021
59625b3
Add Init param for MafeFunction and docs for is_null kernel
Christian8491 Aug 6, 2021
3f91336
Add defaults() method to NanNullOptions class
Christian8491 Aug 6, 2021
438a4ef
Add implementation for IsNullOperator, Scalar case
Christian8491 Aug 6, 2021
b843c0a
Fix test compilation issue
Christian8491 Aug 6, 2021
2c57c77
Apply clang format, add tests for isnull
Christian8491 Aug 7, 2021
b6716e0
Improve message for is_null tests
Christian8491 Aug 7, 2021
7e02c37
Apply requested changes for IsNullOperator
Christian8491 Aug 9, 2021
083d4a6
Remove default break, add todo to handle ArrayData for is_null
Christian8491 Aug 9, 2021
102921f
Apply SetBitsTo for NaN values when passed NanNullOptions
Christian8491 Aug 10, 2021
a9e3229
move kNanNullOptions to anonymous namespace
Christian8491 Aug 11, 2021
433a8c4
Fix for arrow-compute-expression-test (is_null)
Christian8491 Aug 11, 2021
71cfbfe
Add bindings in cython
Christian8491 Aug 11, 2021
aef53d0
Add specialized tests for is_null
Christian8491 Aug 12, 2021
57a7d3e
Fix Sanitizer for KNanNullOptions
Christian8491 Aug 12, 2021
b8582c7
Remove cython bindigs
Christian8491 Aug 12, 2021
4197bcc
binding in cython layer, fix python tests (theorically)
Christian8491 Aug 12, 2021
42943b9
reworked kernel implementation
edponce Aug 17, 2021
6ae0316
Merge branch 'ARROW-12959-Option-for-is-nullNaN-to-evaluate-to-tru' i…
Christian8491 Aug 17, 2021
557d8e7
Merge pull request #1 from edponce/ARROW-12959-R-Option-for-is-nullNa…
Christian8491 Aug 17, 2021
14bdd00
Add NanNullOptions for IsNull kernel
Christian8491 Aug 6, 2021
43219a1
Add kNanNullOptionsType to RegisterScalarOptions
Christian8491 Aug 6, 2021
98ead77
Add Init param for MafeFunction and docs for is_null kernel
Christian8491 Aug 6, 2021
f8514a8
Add defaults() method to NanNullOptions class
Christian8491 Aug 6, 2021
e5a950d
Add implementation for IsNullOperator, Scalar case
Christian8491 Aug 6, 2021
b2414e9
Fix test compilation issue
Christian8491 Aug 6, 2021
d3eb679
Apply clang format, add tests for isnull
Christian8491 Aug 7, 2021
12b7c9e
Improve message for is_null tests
Christian8491 Aug 7, 2021
8f82b98
Apply requested changes for IsNullOperator
Christian8491 Aug 9, 2021
f001068
Remove default break, add todo to handle ArrayData for is_null
Christian8491 Aug 9, 2021
c64a30a
Apply SetBitsTo for NaN values when passed NanNullOptions
Christian8491 Aug 10, 2021
2e3c8a6
move kNanNullOptions to anonymous namespace
Christian8491 Aug 11, 2021
5abfa2e
Fix for arrow-compute-expression-test (is_null)
Christian8491 Aug 11, 2021
35df5d4
Add bindings in cython
Christian8491 Aug 11, 2021
cfab892
Add specialized tests for is_null
Christian8491 Aug 12, 2021
e92318d
Fix Sanitizer for KNanNullOptions
Christian8491 Aug 12, 2021
f2b439e
Remove cython bindigs
Christian8491 Aug 12, 2021
a3707a7
binding in cython layer, fix python tests (theorically)
Christian8491 Aug 12, 2021
2a8f739
Fix conflict in scalar_validity
Christian8491 Aug 17, 2021
3028a36
remove enable_if and add NanNullState alias
edponce Aug 18, 2021
3807e98
update documentation
edponce Aug 18, 2021
8d26437
update function doc
edponce Aug 18, 2021
4ad9337
fix NanNullOptions in Python bindings
edponce Aug 18, 2021
2831063
Merge pull request #2 from edponce/ARROW-12959-R-Option-for-is-nullNa…
Christian8491 Aug 18, 2021
915c868
Requested changes for scalar_validity_test and docs
Christian8491 Aug 18, 2021
4588822
Add python test using nan_is_null
Christian8491 Aug 18, 2021
9a4f39a
Update R bindings
ianmcook Aug 20, 2021
1a9a826
Merge branch 'master' into ARROW-12959-Option-for-is-nullNaN-to-evalu…
Christian8491 Aug 23, 2021
71b85d2
Add cpp/submodules
Christian8491 Aug 25, 2021
50afc05
Merge branch 'master' into ARROW-12959-Option-for-is-nullNaN-to-evalu…
Christian8491 Aug 25, 2021
7804c49
parquet_testing sub without changes
Christian8491 Aug 25, 2021
0e47c99
Merge branch 'master' into ARROW-12959-Option-for-is-nullNaN-to-evalu…
pitrou Aug 26, 2021
30751fc
Address review comments, also restructure docs a bit
pitrou Aug 26, 2021
f09d52a
Some more test cleanup
pitrou Aug 26, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion cpp/src/arrow/compute/api_scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ static auto kMakeStructOptionsType = GetFunctionOptionsType<MakeStructOptions>(
static auto kDayOfWeekOptionsType = GetFunctionOptionsType<DayOfWeekOptions>(
DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering),
DataMember("week_start", &DayOfWeekOptions::week_start));
// Options-type descriptor for NullOptions, built from its single data member
// `nan_is_null`. It is handed to the FunctionOptions base by the NullOptions
// constructor and added to the registry in RegisterScalarOptions().
static auto kNullOptionsType = GetFunctionOptionsType<NullOptions>(
DataMember("nan_is_null", &NullOptions::nan_is_null));
} // namespace
} // namespace internal

Expand Down Expand Up @@ -291,6 +293,10 @@ DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start
week_start(week_start) {}
constexpr char DayOfWeekOptions::kTypeName[];

NullOptions::NullOptions(bool nan_is_null)
: FunctionOptions(internal::kNullOptionsType), nan_is_null(nan_is_null) {}
constexpr char NullOptions::kTypeName[];

namespace internal {
void RegisterScalarOptions(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType));
Expand All @@ -310,6 +316,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kNullOptionsType));
}
} // namespace internal

Expand Down Expand Up @@ -463,7 +470,6 @@ Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions opti
// Validity functions

SCALAR_EAGER_UNARY(IsValid, "is_valid")
SCALAR_EAGER_UNARY(IsNull, "is_null")
SCALAR_EAGER_UNARY(IsNan, "is_nan")

Result<Datum> FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx) {
Expand All @@ -483,6 +489,10 @@ Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
return CallFunction("case_when", args, ctx);
}

// Eager entry point for the "is_null" compute function.
// `options` is received by value; the address of that local copy is what gets
// passed on to CallFunction together with the single input datum.
Result<Datum> IsNull(const Datum& arg, NullOptions options, ExecContext* ctx) {
  const std::vector<Datum> call_args{arg};
  return CallFunction("is_null", call_args, &options, ctx);
}

// ----------------------------------------------------------------------
// Temporal functions

Expand Down
13 changes: 12 additions & 1 deletion cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,15 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions {
int64_t start, stop, step;
};

/// \brief Options for the "is_null" function.
class ARROW_EXPORT NullOptions : public FunctionOptions {
 public:
  /// \param[in] nan_is_null whether NaN values are also reported as null
  explicit NullOptions(bool nan_is_null = false);

  /// Name of this options type.
  static constexpr const char kTypeName[] = "NullOptions";

  /// Options with every field at its default (`nan_is_null` is false).
  static NullOptions Defaults() { return NullOptions(); }

  /// When true, "is_null" may emit true for NaN values as well as for nulls.
  bool nan_is_null = false;
};

enum CompareOperator : int8_t {
EQUAL,
NOT_EQUAL,
Expand Down Expand Up @@ -756,13 +765,15 @@ Result<Datum> IsValid(const Datum& values, ExecContext* ctx = NULLPTR);
/// false otherwise
///
/// \param[in] values input to examine for nullity
/// \param[in] options NullOptions; set `nan_is_null` to also emit true for NaN values
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> IsNull(const Datum& values, ExecContext* ctx = NULLPTR);
Result<Datum> IsNull(const Datum& values, NullOptions options = NullOptions::Defaults(),
ExecContext* ctx = NULLPTR);

/// \brief IsNan returns true for each element of `values` that is NaN,
/// false otherwise
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/arrow/compute/exec/expression.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1154,7 +1154,9 @@ Expression greater_equal(Expression lhs, Expression rhs) {
return call("greater_equal", {std::move(lhs), std::move(rhs)});
}

Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); }
// Builds a call expression for the "is_null" function applied to `lhs`.
// `nan_is_null` is carried to the function through a NullOptions instance.
Expression is_null(Expression lhs, bool nan_is_null) {
  // Only `lhs` is worth moving; `nan_is_null` is a plain bool, for which
  // std::move would be a no-op cast, so it is passed directly.
  return call("is_null", {std::move(lhs)}, compute::NullOptions(nan_is_null));
}

Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); }

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/exec/expression.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);

ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);

ARROW_EXPORT Expression is_null(Expression lhs);
ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false);

ARROW_EXPORT Expression is_valid(Expression lhs);

Expand Down
82 changes: 69 additions & 13 deletions cpp/src/arrow/compute/kernels/scalar_validity.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <cmath>

#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common.h"

#include "arrow/util/bit_util.h"
Expand Down Expand Up @@ -74,21 +75,71 @@ struct IsInfOperator {
}
};

using NanOptionsState = OptionsWrapper<NullOptions>;

struct IsNullOperator {
// Scalar path: stores into `out` (a BooleanScalar) whether `in` counts as null.
//   - an invalid scalar is always null;
//   - a valid scalar is null only when the `nan_is_null` option is set, its
//     type is floating point, and its value is NaN;
//   - every other valid scalar is not null.
// Returns NotImplemented for a floating type other than FLOAT or DOUBLE when
// NaN detection was requested.
static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
  const auto& options = NanOptionsState::Get(ctx);
  bool* out_value = &checked_cast<BooleanScalar*>(out)->value;

  if (!in.is_valid) {
    *out_value = true;
    return Status::OK();
  }

  // Valid input: NaN inspection is needed only when requested and only for
  // floating-point types.
  if (!(options.nan_is_null && is_floating(in.type->id()))) {
    *out_value = false;
    return Status::OK();
  }

  switch (in.type->id()) {
    case Type::FLOAT:
      *out_value = std::isnan(internal::UnboxScalar<FloatType>::Unbox(in));
      break;
    case Type::DOUBLE:
      *out_value = std::isnan(internal::UnboxScalar<DoubleType>::Unbox(in));
      break;
    default:
      return Status::NotImplemented("NaN detection not implemented for type ",
                                    in.type->ToString());
  }
  return Status::OK();
}

// For each position `pos` in `arr` whose value (read as T from buffer 1) is
// NaN, sets bit `out_offset + pos` in `out_bitmap`. Bits belonging to non-NaN
// positions are not modified.
template <typename T>
static void SetNanBits(const ArrayData& arr, uint8_t* out_bitmap, int64_t out_offset) {
  const T* values = arr.GetValues<T>(1);
  const int64_t length = arr.length;
  int64_t pos = 0;
  while (pos < length) {
    const bool value_is_nan = std::isnan(values[pos]);
    if (value_is_nan) {
      BitUtil::SetBit(out_bitmap, pos + out_offset);
    }
    ++pos;
  }
}

// Array path: fills the boolean bitmap in `out->buffers[1]`, starting at
// `out->offset`, with one bit per input element telling whether it is null.
// The bitmap is first derived from the input validity, then (when the
// `nan_is_null` option is set and the input type is floating point) the bits
// of NaN values are additionally set.
// Returns NotImplemented for a floating type other than FLOAT or DOUBLE when
// NaN detection was requested.
static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
  const auto& options = NanOptionsState::Get(ctx);

  uint8_t* out_bitmap = out->buffers[1]->mutable_data();
  if (arr.GetNullCount() > 0) {
    // Input has nulls => output is the inverted null (validity) bitmap.
    InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length, out_bitmap,
                 out->offset);
  } else {
    // Input has no nulls => output is entirely false.
    BitUtil::SetBitsTo(out_bitmap, out->offset, out->length, false);
  }

  // Test the option flag first, mirroring the scalar path.
  if (options.nan_is_null && is_floating(arr.type->id())) {
    switch (arr.type->id()) {
      case Type::FLOAT:
        SetNanBits<float>(arr, out_bitmap, out->offset);
        break;
      case Type::DOUBLE:
        SetNanBits<double>(arr, out_bitmap, out->offset);
        break;
      default:
        return Status::NotImplemented("NaN detection not implemented for type ",
                                      arr.type->ToString());
    }
  }
  return Status::OK();
}
Expand All @@ -104,11 +155,13 @@ struct IsNanOperator {
void MakeFunction(std::string name, const FunctionDoc* doc,
std::vector<InputType> in_types, OutputType out_type,
ArrayKernelExec exec, FunctionRegistry* registry,
MemAllocation::type mem_allocation, bool can_write_into_slices) {
MemAllocation::type mem_allocation, bool can_write_into_slices,
const FunctionOptions* default_options = NULLPTR,
KernelInit init = NULLPTR) {
Arity arity{static_cast<int>(in_types.size())};
auto func = std::make_shared<ScalarFunction>(name, arity, doc);
auto func = std::make_shared<ScalarFunction>(name, arity, doc, default_options);

ScalarKernel kernel(std::move(in_types), out_type, exec);
ScalarKernel kernel(std::move(in_types), out_type, exec, init);
kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
kernel.can_write_into_slices = can_write_into_slices;
kernel.mem_allocation = mem_allocation;
Expand Down Expand Up @@ -202,9 +255,11 @@ const FunctionDoc is_inf_doc(
("For each input value, emit true iff the value is infinite (inf or -inf)."),
{"values"});

const FunctionDoc is_null_doc("Return true if null",
("For each input value, emit true iff the value is null."),
{"values"});
// Documentation record passed to MakeFunction() when "is_null" is registered:
// a one-line summary, a longer description, and the argument name list.
// NOTE(review): the trailing "NullOptions" string presumably names the options
// class associated with the function -- confirm against FunctionDoc.
const FunctionDoc is_null_doc(
"Return true if null (and optionally NaN)",
("For each input value, emit true iff the value is null.\n"
"True may also be emitted for NaN values by setting the `nan_is_null` flag."),
{"values"}, "NullOptions");

const FunctionDoc is_nan_doc("Return true if NaN",
("For each input value, emit true iff the value is NaN."),
Expand All @@ -213,12 +268,13 @@ const FunctionDoc is_nan_doc("Return true if NaN",
} // namespace

void RegisterScalarValidity(FunctionRegistry* registry) {
static auto kNullOptions = NullOptions::Defaults();
MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec,
registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false);

MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec,
registry, MemAllocation::PREALLOCATE,
/*can_write_into_slices=*/true);
/*can_write_into_slices=*/true, &kNullOptions, NanOptionsState::Init);

DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc)));
DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc)));
Expand Down
Loading