Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1311,7 +1311,7 @@ TEST_F(TestFWBinaryArray, ZeroSize) {
const auto& fw_array = checked_cast<const FixedSizeBinaryArray&>(*array);

// data is never allocated
ASSERT_TRUE(fw_array.values() == nullptr);
ASSERT_EQ(fw_array.values()->size(), 0);
ASSERT_EQ(0, fw_array.byte_width());

ASSERT_EQ(6, array->length());
Expand Down
13 changes: 11 additions & 2 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -297,12 +297,21 @@ MapArray::MapArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }

MapArray::MapArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& offsets,
const std::shared_ptr<Array>& keys,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
int64_t offset) {
SetData(ArrayData::Make(type, length, {null_bitmap, offsets}, {values->data()},
null_count, offset));
}

MapArray::MapArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& offsets,
const std::shared_ptr<Array>& keys,
const std::shared_ptr<Array>& items,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
int64_t offset) {
auto pair_data = ArrayData::Make(type->children()[0]->type(), keys->data()->length,
{nullptr}, {keys->data(), values->data()}, 0, offset);
{nullptr}, {keys->data(), items->data()}, 0, offset);
auto map_data = ArrayData::Make(type, length, {null_bitmap, offsets}, {pair_data},
null_count, offset);
SetData(map_data);
Expand Down
8 changes: 7 additions & 1 deletion cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,13 @@ class ARROW_EXPORT MapArray : public ListArray {

MapArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& values,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

MapArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/array/builder_primitive.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ Status BooleanBuilder::Resize(int64_t capacity) {
}

Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
std::shared_ptr<Buffer> data, null_bitmap;
RETURN_NOT_OK(data_builder_.Finish(&data));
std::shared_ptr<Buffer> null_bitmap, data;
RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
RETURN_NOT_OK(data_builder_.Finish(&data));

*out = ArrayData::Make(boolean(), length_, {null_bitmap, data}, null_count_);

Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/buffer-builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ class ARROW_EXPORT BufferBuilder {
ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
if (size_ != 0) buffer_->ZeroPadding();
*out = buffer_;
if (*out == NULLPTR) {
ARROW_RETURN_NOT_OK(AllocateBuffer(pool_, 0, out));
}
Reset();
return Status::OK();
}
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,4 @@ add_arrow_benchmark(compare-benchmark PREFIX "arrow-compute")
add_arrow_test(take-test PREFIX "arrow-compute")
add_arrow_test(filter-test PREFIX "arrow-compute")
add_arrow_benchmark(filter-benchmark PREFIX "arrow-compute")
add_arrow_benchmark(take-benchmark PREFIX "arrow-compute")
31 changes: 31 additions & 0 deletions cpp/src/arrow/compute/kernels/filter-benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,30 @@ static void FilterFixedSizeList1Int64(benchmark::State& state) {
}
}

static void FilterString(benchmark::State& state) {
RegressionArgs args(state);

int32_t string_min_length = 0, string_max_length = 128;
int32_t string_mean_length = (string_max_length + string_min_length) / 2;
// for an array of 50% null strings, we need to generate twice as many strings
// to ensure that they have an average of args.size total characters
auto array_size =
static_cast<int64_t>(args.size / string_mean_length / (1 - args.null_proportion));

auto rand = random::RandomArrayGenerator(kSeed);
auto array = std::static_pointer_cast<StringArray>(rand.String(
array_size, string_min_length, string_max_length, args.null_proportion));
auto filter = std::static_pointer_cast<BooleanArray>(
rand.Boolean(array_size, 0.75, args.null_proportion));

FunctionContext ctx;
for (auto _ : state) {
Datum out;
ABORT_NOT_OK(Filter(&ctx, Datum(array), Datum(filter), &out));
benchmark::DoNotOptimize(out);
}
}

BENCHMARK(FilterInt64)
->Apply(RegressionSetArgs)
->Args({1 << 20, 1})
Expand All @@ -82,5 +106,12 @@ BENCHMARK(FilterFixedSizeList1Int64)
->MinTime(1.0)
->Unit(benchmark::TimeUnit::kNanosecond);

BENCHMARK(FilterString)
->Apply(RegressionSetArgs)
->Args({1 << 20, 1})
->Args({1 << 23, 1})
->MinTime(1.0)
->Unit(benchmark::TimeUnit::kNanosecond);

} // namespace compute
} // namespace arrow
57 changes: 51 additions & 6 deletions cpp/src/arrow/compute/kernels/filter-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ namespace compute {
using internal::checked_pointer_cast;
using util::string_view;

constexpr auto kSeed = 0x0ff1ce;

template <typename ArrowType>
class TestFilterKernel : public ComputeFixture, public TestBase {
protected:
Expand All @@ -42,23 +44,29 @@ class TestFilterKernel : public ComputeFixture, public TestBase {
const std::shared_ptr<Array>& expected) {
std::shared_ptr<Array> actual;
ASSERT_OK(arrow::compute::Filter(&this->ctx_, *values, *filter, &actual));
ASSERT_OK(ValidateArray(*actual));
AssertArraysEqual(*expected, *actual);
}

void AssertFilter(const std::shared_ptr<DataType>& type, const std::string& values,
const std::string& filter, const std::string& expected) {
std::shared_ptr<Array> actual;
ASSERT_OK(this->Filter(type, values, filter, &actual));
ASSERT_OK(ValidateArray(*actual));
AssertArraysEqual(*ArrayFromJSON(type, expected), *actual);
}

Status Filter(const std::shared_ptr<DataType>& type, const std::string& values,
const std::string& filter, std::shared_ptr<Array>* out) {
return arrow::compute::Filter(&this->ctx_, *ArrayFromJSON(type, values),
*ArrayFromJSON(boolean(), filter), out);
}

void ValidateFilter(const std::shared_ptr<Array>& values,
const std::shared_ptr<Array>& filter_boxed) {
std::shared_ptr<Array> filtered;
ASSERT_OK(arrow::compute::Filter(&this->ctx_, *values, *filter_boxed, &filtered));
ASSERT_OK(ValidateArray(*filtered));

auto filter = checked_pointer_cast<BooleanArray>(filter_boxed);
int64_t values_i = 0, filtered_i = 0;
Expand All @@ -84,11 +92,13 @@ class TestFilterKernelWithNull : public TestFilterKernel<NullType> {
protected:
void AssertFilter(const std::string& values, const std::string& filter,
const std::string& expected) {
TestFilterKernel<NullType>::AssertFilter(utf8(), values, filter, expected);
TestFilterKernel<NullType>::AssertFilter(null(), values, filter, expected);
}
};

TEST_F(TestFilterKernelWithNull, FilterNull) {
this->AssertFilter("[]", "[]", "[]");

this->AssertFilter("[null, null, null]", "[0, 1, 0]", "[null]");
this->AssertFilter("[null, null, null]", "[1, 1, 0]", "[null, null]");
}
Expand All @@ -102,6 +112,8 @@ class TestFilterKernelWithBoolean : public TestFilterKernel<BooleanType> {
};

TEST_F(TestFilterKernelWithBoolean, FilterBoolean) {
this->AssertFilter("[]", "[]", "[]");

this->AssertFilter("[true, false, true]", "[0, 1, 0]", "[false]");
this->AssertFilter("[null, false, true]", "[0, 1, 0]", "[false]");
this->AssertFilter("[true, false, true]", "[null, 1, 0]", "[null, false]");
Expand All @@ -114,6 +126,7 @@ class TestFilterKernelWithNumeric : public TestFilterKernel<ArrowType> {
const std::string& expected) {
TestFilterKernel<ArrowType>::AssertFilter(type_singleton(), values, filter, expected);
}

std::shared_ptr<DataType> type_singleton() {
return TypeTraits<ArrowType>::type_singleton();
}
Expand All @@ -135,13 +148,16 @@ TYPED_TEST(TestFilterKernelWithNumeric, FilterNumeric) {
this->AssertFilter("[null, 8, 9]", "[0, 1, 0]", "[8]");
this->AssertFilter("[7, 8, 9]", "[null, 1, 0]", "[null, 8]");
this->AssertFilter("[7, 8, 9]", "[1, null, 1]", "[7, null, 9]");

std::shared_ptr<Array> arr;
ASSERT_RAISES(Invalid, this->Filter(this->type_singleton(), "[7, 8, 9]", "[]", &arr));
}

TYPED_TEST(TestFilterKernelWithNumeric, FilterRandomNumeric) {
auto rand = random::RandomArrayGenerator(0x5416447);
auto rand = random::RandomArrayGenerator(kSeed);
for (size_t i = 3; i < 13; i++) {
const int64_t length = static_cast<int64_t>(1ULL << i);
for (auto null_probability : {0.0, 0.01, 0.1, 0.25, 0.5, 1.0}) {
for (auto null_probability : {0.0, 0.01, 0.25, 1.0}) {
for (auto filter_probability : {0.0, 0.01, 0.1, 0.25, 0.5, 1.0}) {
auto values = rand.Numeric<TypeParam>(length, 0, 127, null_probability);
auto filter = rand.Boolean(length, filter_probability, null_probability);
Expand Down Expand Up @@ -191,7 +207,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) {
using ArrayType = typename TypeTraits<TypeParam>::ArrayType;
using CType = typename TypeTraits<TypeParam>::CType;

auto rand = random::RandomArrayGenerator(0x5416447);
auto rand = random::RandomArrayGenerator(kSeed);
for (size_t i = 3; i < 13; i++) {
const int64_t length = static_cast<int64_t>(1ULL << i);
// TODO(bkietz) rewrite with some nulls
Expand All @@ -206,6 +222,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) {
&selection));
ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(array), selection, &filtered));
auto filtered_array = filtered.make_array();
ASSERT_OK(ValidateArray(*filtered_array));
auto expected =
CompareAndFilter<TypeParam>(array->raw_values(), array->length(), c_fifty, op);
ASSERT_ARRAYS_EQUAL(*filtered_array, *expected);
Expand All @@ -216,7 +233,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareScalarAndFilterRandomNumeric) {
TYPED_TEST(TestFilterKernelWithNumeric, CompareArrayAndFilterRandomNumeric) {
using ArrayType = typename TypeTraits<TypeParam>::ArrayType;

auto rand = random::RandomArrayGenerator(0x5416447);
auto rand = random::RandomArrayGenerator(kSeed);
for (size_t i = 3; i < 13; i++) {
const int64_t length = static_cast<int64_t>(1ULL << i);
auto lhs =
Expand All @@ -230,6 +247,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, CompareArrayAndFilterRandomNumeric) {
&selection));
ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(lhs), selection, &filtered));
auto filtered_array = filtered.make_array();
ASSERT_OK(ValidateArray(*filtered_array));
auto expected = CompareAndFilter<TypeParam>(lhs->raw_values(), lhs->length(),
rhs->raw_values(), op);
ASSERT_ARRAYS_EQUAL(*filtered_array, *expected);
Expand All @@ -242,7 +260,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, ScalarInRangeAndFilterRandomNumeric) {
using ArrayType = typename TypeTraits<TypeParam>::ArrayType;
using CType = typename TypeTraits<TypeParam>::CType;

auto rand = random::RandomArrayGenerator(0x5416447);
auto rand = random::RandomArrayGenerator(kSeed);
for (size_t i = 3; i < 13; i++) {
const int64_t length = static_cast<int64_t>(1ULL << i);
auto array =
Expand All @@ -259,6 +277,7 @@ TYPED_TEST(TestFilterKernelWithNumeric, ScalarInRangeAndFilterRandomNumeric) {
&selection));
ASSERT_OK(arrow::compute::Filter(&this->ctx_, Datum(array), selection, &filtered));
auto filtered_array = filtered.make_array();
ASSERT_OK(ValidateArray(*filtered_array));
auto expected = CompareAndFilter<TypeParam>(
array->raw_values(), array->length(),
[&](CType e) { return (e > c_fifty) && (e < c_hundred); });
Expand Down Expand Up @@ -313,6 +332,32 @@ TEST_F(TestFilterKernelWithList, FilterListInt32) {
this->AssertFilter(list(int32()), list_json, "[0, 1, 0, 1]", "[[1,2], [3]]");
}

TEST_F(TestFilterKernelWithList, FilterListListInt32) {
std::string list_json = R"([
[],
[[1], [2, null, 2], []],
null,
[[3, null], null]
])";
auto type = list(list(int32()));
this->AssertFilter(type, list_json, "[0, 0, 0, 0]", "[]");
this->AssertFilter(type, list_json, "[0, 1, 1, null]", R"([
[[1], [2, null, 2], []],
null,
null
])");
this->AssertFilter(type, list_json, "[0, 0, 1, null]", "[null, null]");
this->AssertFilter(type, list_json, "[1, 0, 0, 1]", R"([
[],
[[3, null], null]
])");
this->AssertFilter(type, list_json, "[1, 1, 1, 1]", list_json);
this->AssertFilter(type, list_json, "[0, 1, 0, 1]", R"([
[[1], [2, null, 2], []],
[[3, null], null]
])");
}

class TestFilterKernelWithFixedSizeList : public TestFilterKernel<FixedSizeListType> {};

TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) {
Expand Down
Loading