Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions c_glib/test/test-is-in.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,20 @@ def test_no_null
def test_null_in_left
left = build_int16_array([1, 0, nil, 2])
right = build_int16_array([2, 0, 3])
assert_equal(build_boolean_array([false, true, nil, true]),
assert_equal(build_boolean_array([false, true, false, true]),
left.is_in(right))
end

def test_null_in_right
left = build_int16_array([1, 0, 1, 2])
right = build_int16_array([2, 0, nil, 2, 0])
right = build_int16_array([2, 0, nil])
assert_equal(build_boolean_array([false, true, false, true]),
left.is_in(right))
end

def test_null_in_both
left = build_int16_array([1, 0, nil, 2])
right = build_int16_array([2, 0, nil, 2, 0, nil])
right = build_int16_array([2, 0, nil])
assert_equal(build_boolean_array([false, true, true, true]),
left.is_in(right))
end
Expand All @@ -52,8 +52,8 @@ def test_null_in_both
def test_no_null
left = build_int16_array([1, 0, 1, 2])
chunks = [
build_int16_array([1, 0]),
build_int16_array([1, 0, 3])
build_int16_array([1, 4]),
build_int16_array([3, 0])
]
right = Arrow::ChunkedArray.new(chunks)
assert_equal(build_boolean_array([true, true, true, false]),
Expand All @@ -63,19 +63,19 @@ def test_no_null
def test_null_in_left
left = build_int16_array([1, 0, nil, 2])
chunks = [
build_int16_array([2, 0, 3]),
build_int16_array([3, 0, 2, 2])
build_int16_array([2, 0]),
build_int16_array([3, 4])
]
right = Arrow::ChunkedArray.new(chunks)
assert_equal(build_boolean_array([false, true, nil, true]),
assert_equal(build_boolean_array([false, true, false, true]),
left.is_in_chunked_array(right))
end

def test_null_in_right
left = build_int16_array([1, 0, 1, 2])
chunks = [
build_int16_array([2, 0, nil, 2, 0]),
build_int16_array([2, 3, nil])
build_int16_array([2, 0]),
build_int16_array([3, nil])
]
right = Arrow::ChunkedArray.new(chunks)
assert_equal(build_boolean_array([false, true, false, true]),
Expand All @@ -85,8 +85,8 @@ def test_null_in_right
def test_null_in_both
left = build_int16_array([1, 0, nil, 2])
chunks = [
build_int16_array([2, 0, nil, 2, 0, nil]),
build_int16_array([2, 3, nil])
build_int16_array([2, 0]),
build_int16_array([3, nil])
]
right = Arrow::ChunkedArray.new(chunks)
assert_equal(build_boolean_array([false, true, true, true]),
Expand Down
26 changes: 16 additions & 10 deletions cpp/src/arrow/compute/api_scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,30 +57,36 @@ SCALAR_ARITHMETIC_BINARY(Divide, "divide", "divide_checked")
// Set-related operations

static Result<Datum> ExecSetLookup(const std::string& func_name, const Datum& data,
const Datum& value_set, bool add_nulls_to_hash_table,
ExecContext* ctx) {
if (!value_set.is_arraylike()) {
const SetLookupOptions& options, ExecContext* ctx) {
if (!options.value_set.is_arraylike()) {
return Status::Invalid("Set lookup value set must be Array or ChunkedArray");
}

if (value_set.length() > 0 && !data.type()->Equals(value_set.type())) {
if (options.value_set.length() > 0 && !data.type()->Equals(options.value_set.type())) {
std::stringstream ss;
ss << "Array type didn't match type of values set: " << data.type()->ToString()
<< " vs " << value_set.type()->ToString();
<< " vs " << options.value_set.type()->ToString();
return Status::Invalid(ss.str());
}
SetLookupOptions options(value_set, !add_nulls_to_hash_table);
return CallFunction(func_name, {data}, &options, ctx);
}

// Overload taking a caller-supplied SetLookupOptions (which carries the value
// set). All validation and dispatch is delegated to ExecSetLookup, invoked
// with the compute function name "is_in".
Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
ExecContext* ctx) {
return ExecSetLookup("is_in", values, options, ctx);
}

Result<Datum> IsIn(const Datum& values, const Datum& value_set, ExecContext* ctx) {
return ExecSetLookup("is_in", values, value_set,
/*add_nulls_to_hash_table=*/false, ctx);
return ExecSetLookup("is_in", values, SetLookupOptions{value_set}, ctx);
}

// Overload taking a caller-supplied SetLookupOptions (which carries the value
// set). All validation and dispatch is delegated to ExecSetLookup, invoked
// with the compute function name "index_in".
Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
ExecContext* ctx) {
return ExecSetLookup("index_in", values, options, ctx);
}

Result<Datum> IndexIn(const Datum& values, const Datum& value_set, ExecContext* ctx) {
return ExecSetLookup("index_in", values, value_set,
/*add_nulls_to_hash_table=*/true, ctx);
return ExecSetLookup("index_in", values, SetLookupOptions{value_set}, ctx);
}

// ----------------------------------------------------------------------
Expand Down
18 changes: 10 additions & 8 deletions cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,17 +290,19 @@ Result<Datum> KleeneAndNot(const Datum& left, const Datum& right,
/// \brief IsIn returns true for each element of `values` that is contained in
/// `value_set`
///
/// If null occurs in left, if null count in right is not 0,
/// it returns true, else returns null.
/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could mention here that the default is skip_nulls=False (or in the docstring of SetLookupOptions)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, could you add a similar sentence ("Behaviour of nulls is governed by SetLookupOptions::skip_nulls") to is_in_doc?

///
/// \param[in] values array-like input to look up in value_set
/// \param[in] value_set either Array or ChunkedArray
/// \param[in] options SetLookupOptions
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
ExecContext* ctx = NULLPTR);
ARROW_EXPORT
Result<Datum> IsIn(const Datum& values, const Datum& value_set,
ExecContext* ctx = NULLPTR);

Expand All @@ -312,19 +314,19 @@ Result<Datum> IsIn(const Datum& values, const Datum& value_set,
/// For example, given values = [99, 42, 3, null] and
/// value_set = [3, 99], the output will be [1, null, 0, null]
///
/// Note: Null in the values is considered to match
/// a null in the value_set array. For example given
/// values = [99, 42, 3, null] and value_set = [3, 99, null],
/// the output will be = [1, null, 0, 2]
/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
///
/// \param[in] values array-like input
/// \param[in] value_set either Array or ChunkedArray
/// \param[in] options SetLookupOptions
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
ExecContext* ctx = NULLPTR);
ARROW_EXPORT
Result<Datum> IndexIn(const Datum& values, const Datum& value_set,
ExecContext* ctx = NULLPTR);

Expand Down
111 changes: 60 additions & 51 deletions cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,53 +36,59 @@ namespace {

template <typename Type>
struct SetLookupState : public KernelState {
explicit SetLookupState(MemoryPool* pool)
: lookup_table(pool, 0), lookup_null_count(0) {}
explicit SetLookupState(const SetLookupOptions& options, MemoryPool* pool)
: options(options), lookup_table(pool, 0) {}

Status Init(const SetLookupOptions& options) {
Status Init() {
if (options.value_set.kind() == Datum::ARRAY) {
RETURN_NOT_OK(AddArrayValueSet(*options.value_set.array()));
} else if (options.value_set.kind() == Datum::CHUNKED_ARRAY) {
const ChunkedArray& value_set = *options.value_set.chunked_array();
for (const std::shared_ptr<Array>& chunk : value_set.chunks()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have a lot of code like this; maybe later we should add Datum::chunks() so we can write:

if (!options.value_set.is_arraylike()) {
  return Status::Invalid("value_set should be an array or chunked array");
}
for (const std::shared_ptr<ArrayData>& chunk : options.value_set.chunks()) {
  RETURN_NOT_OK(AddArrayValueSet(*chunk));
}

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're not too bothered by the cost of a temporary vector then it may be nice indeed.

RETURN_NOT_OK(AddArrayValueSet(*chunk->data()));
}
} else {
return Status::Invalid("value_set should be an array or chunked array");
}
if (lookup_table.size() != options.value_set.length()) {
return Status::NotImplemented("duplicate values in value_set");
}
value_set_has_null = (lookup_table.GetNull() >= 0);
return Status::OK();
}

Status AddArrayValueSet(const ArrayData& data) {
using T = typename GetViewType<Type>::T;
auto visit_valid = [&](T v) {
int32_t unused_memo_index;
return lookup_table.GetOrInsert(v, &unused_memo_index);
};
auto visit_null = [&]() {
if (!options.skip_nulls) {
lookup_table.GetOrInsertNull();
}
lookup_table.GetOrInsertNull();
return Status::OK();
};
if (options.value_set.kind() == Datum::ARRAY) {
const std::shared_ptr<ArrayData>& value_set = options.value_set.array();
this->lookup_null_count += value_set->GetNullCount();
return VisitArrayDataInline<Type>(*value_set, std::move(visit_valid),
std::move(visit_null));
} else {
const ChunkedArray& value_set = *options.value_set.chunked_array();
for (const std::shared_ptr<Array>& chunk : value_set.chunks()) {
this->lookup_null_count += chunk->null_count();
RETURN_NOT_OK(VisitArrayDataInline<Type>(*chunk->data(), std::move(visit_valid),
std::move(visit_null)));
}
return Status::OK();
}

return VisitArrayDataInline<Type>(data, visit_valid, visit_null);
}

using MemoTable = typename HashTraits<Type>::MemoTableType;
const SetLookupOptions& options;
MemoTable lookup_table;
int64_t lookup_null_count;
int64_t null_index = -1;
bool value_set_has_null;
};

template <>
struct SetLookupState<NullType> : public KernelState {
explicit SetLookupState(MemoryPool*) {}
explicit SetLookupState(const SetLookupOptions& options, MemoryPool*)
: options(options) {}

Status Init(const SetLookupOptions& options) {
this->lookup_null_count = options.value_set.null_count();
Status Init() {
this->value_set_has_null = (options.value_set.length() > 0);
return Status::OK();
}

int64_t lookup_null_count;
const SetLookupOptions& options;
bool value_set_has_null;
};

// TODO: Put this concept somewhere reusable
Expand Down Expand Up @@ -125,8 +131,8 @@ struct InitStateVisitor {
"Attempted to call a set lookup function without SetLookupOptions");
}
using StateType = SetLookupState<Type>;
result.reset(new StateType(ctx->exec_context()->memory_pool()));
return static_cast<StateType*>(result.get())->Init(*options);
result.reset(new StateType(*options, ctx->exec_context()->memory_pool()));
return static_cast<StateType*>(result.get())->Init();
}

Status Visit(const DataType&) { return Init<NullType>(); }
Expand Down Expand Up @@ -174,16 +180,18 @@ struct IndexInVisitor {
IndexInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out)
: ctx(ctx), data(data), out(out), builder(ctx->exec_context()->memory_pool()) {}

Status Visit(const DataType&) {
Status Visit(const DataType& type) {
DCHECK_EQ(type.id(), Type::NA);
const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
if (data.length != 0) {
if (state.lookup_null_count == 0) {
RETURN_NOT_OK(this->builder.AppendNulls(data.length));
} else {
// skip_nulls is honored for consistency with other types
if (state.value_set_has_null && !state.options.skip_nulls) {
RETURN_NOT_OK(this->builder.Reserve(data.length));
for (int64_t i = 0; i < data.length; ++i) {
this->builder.UnsafeAppend(0);
}
} else {
RETURN_NOT_OK(this->builder.AppendNulls(data.length));
}
}
return Status::OK();
Expand All @@ -195,7 +203,7 @@ struct IndexInVisitor {

const auto& state = checked_cast<const SetLookupState<Type>&>(*ctx->state());

int32_t null_index = state.lookup_table.GetNull();
int32_t null_index = state.options.skip_nulls ? -1 : state.lookup_table.GetNull();
RETURN_NOT_OK(this->builder.Reserve(data.length));
VisitArrayDataInline<Type>(
data,
Expand Down Expand Up @@ -261,7 +269,7 @@ void ExecIndexIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {

// ----------------------------------------------------------------------

// IsIn writes the results into a preallocated binary data bitmap
// IsIn writes the results into a preallocated boolean data bitmap
struct IsInVisitor {
KernelContext* ctx;
const ArrayData& data;
Expand All @@ -270,12 +278,12 @@ struct IsInVisitor {
IsInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out)
: ctx(ctx), data(data), out(out) {}

Status Visit(const DataType&) {
Status Visit(const DataType& type) {
DCHECK_EQ(type.id(), Type::NA);
const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
ArrayData* output = out->mutable_array();
if (state.lookup_null_count > 0) {
BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), output->offset,
output->length, true);
// skip_nulls is honored for consistency with other types
if (state.value_set_has_null && !state.options.skip_nulls) {
BitUtil::SetBitsTo(output->buffers[1]->mutable_data(), output->offset,
output->length, true);
} else {
Expand All @@ -291,13 +299,6 @@ struct IsInVisitor {
const auto& state = checked_cast<const SetLookupState<Type>&>(*ctx->state());
ArrayData* output = out->mutable_array();

if (this->data.GetNullCount() > 0 && state.lookup_null_count > 0) {
// If there were nulls in the value set, set the whole validity bitmap to
// true
output->null_count = 0;
BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), output->offset,
output->length, true);
}
FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(), output->offset,
output->length);
VisitArrayDataInline<Type>(
Expand All @@ -311,7 +312,11 @@ struct IsInVisitor {
writer.Next();
},
[&]() {
writer.Set();
if (!state.options.skip_nulls && state.lookup_table.GetNull() != -1) {
writer.Set();
} else {
writer.Clear();
}
writer.Next();
});
writer.Finish();
Expand Down Expand Up @@ -412,33 +417,37 @@ class IndexInMetaBinary : public MetaFunction {
const FunctionDoc is_in_doc{
"Find each element in a set of values",
("For each element in `values`, return true if it is found in a given\n"
"set of values. The set of values to look for must be given in\n"
"SetLookupOptions."),
"set of values, false otherwise.\n"
"The set of values to look for must be given in SetLookupOptions.\n"
"By default, nulls are matched against the value set, this can be\n"
"changed in SetLookupOptions."),
{"values"},
"SetLookupOptions"};

const FunctionDoc index_in_doc{
"Return index of each element in a set of values",
("For each element in `values`, return its index in a given set of\n"
"values, or null if it is not found there.\n"
"The set of values to look for must be given in SetLookupOptions."),
"The set of values to look for must be given in SetLookupOptions.\n"
"By default, nulls are matched against the value set, this can be\n"
"changed in SetLookupOptions."),
{"values"},
"SetLookupOptions"};

} // namespace

void RegisterScalarSetLookup(FunctionRegistry* registry) {
// IsIn always writes into preallocated memory
// IsIn writes its boolean output into preallocated memory
{
ScalarKernel isin_base;
isin_base.init = InitSetLookup;
isin_base.exec = TrivialScalarUnaryAsArraysExec(ExecIsIn);
isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL;
auto is_in = std::make_shared<ScalarFunction>("is_in", Arity::Unary(), &is_in_doc);

AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), is_in.get());

isin_base.signature = KernelSignature::Make({null()}, boolean());
isin_base.null_handling = NullHandling::COMPUTED_PREALLOCATE;
DCHECK_OK(is_in->AddKernel(isin_base));
DCHECK_OK(registry->AddFunction(is_in));

Expand Down
Loading