Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,22 @@ void TestPrimitiveBuilder<PBoolean>::Check(const std::unique_ptr<BooleanBuilder>
ASSERT_EQ(0, builder->null_count());
}

// Verifies that a value appended to a NumericBuilder can be read back through
// GetValue() and through both the mutable and the const overload of operator[],
// and that an assignment through the mutable overload is observed by the readers.
TEST(NumericBuilderAccessors, TestSettersGetters) {
  int64_t datum = 42;
  int64_t new_datum = 43;
  NumericBuilder<Int64Type> builder(int64(), default_memory_pool());

  builder.Reset();
  ASSERT_OK(builder.Append(datum));
  ASSERT_EQ(builder.GetValue(0), datum);

  // Now update the value through the mutable operator[].
  builder[0] = new_datum;
  ASSERT_EQ(builder.GetValue(0), new_datum);

  // Binding a const reference makes overload resolution pick the const
  // operator[] without resorting to a cast.
  const NumericBuilder<Int64Type>& const_builder = builder;
  ASSERT_EQ(const_builder[0], new_datum);
}

typedef ::testing::Types<PBoolean, PUInt8, PUInt16, PUInt32, PUInt64, PInt8, PInt16,
PInt32, PInt64, PFloat, PDouble>
Primitives;
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/array/builder_primitive.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ class NumericBuilder : public ArrayBuilder {
return ArrayBuilder::Resize(capacity);
}

/// \brief Read the value stored at the given position
/// \param[in] index position of the element, forwarded unchanged to GetValue()
/// \return a copy of the value reported by GetValue(index)
value_type operator[](int64_t index) const {
  return this->GetValue(index);
}

/// \brief Mutable access to the value stored at the given position
///
/// The index is applied directly to the pointer obtained from
/// data_builder_.mutable_data(); this accessor performs no bounds check itself.
/// \param[in] index position of the element within the data buffer
/// \return a reference to the element, assignable by the caller
value_type& operator[](int64_t index) {
  value_type* raw_values = reinterpret_cast<value_type*>(data_builder_.mutable_data());
  return raw_values[index];
}

/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous C array of values
/// \param[in] length the number of values to append
Expand Down
1 change: 0 additions & 1 deletion cpp/src/arrow/compute/kernels/boolean-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ TEST_F(TestBooleanKernel, Invert) {
}

TEST_F(TestBooleanKernel, InvertEmptyArray) {
auto type = boolean();
std::vector<std::shared_ptr<Buffer>> data_buffers(2);
Datum input;
input.value = ArrayData::Make(boolean(), 0 /* length */, std::move(data_buffers),
Expand Down
135 changes: 134 additions & 1 deletion cpp/src/arrow/compute/kernels/hash-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
#include "arrow/compute/kernels/util-internal.h"
#include "arrow/compute/test-util.h"

#include "arrow/ipc/json-simple.h"

using std::shared_ptr;
using std::vector;

Expand All @@ -61,9 +63,47 @@ void CheckUnique(FunctionContext* ctx, const shared_ptr<DataType>& type,

shared_ptr<Array> result;
ASSERT_OK(Unique(ctx, input, &result));
// TODO: We probably shouldn't rely on array ordering.
ASSERT_ARRAYS_EQUAL(*expected, *result);
}

/// Checks that ValueCounts, applied to a zero-length ArrayData whose two buffer
/// slots are both null, yields a struct whose values and counts fields are empty.
///
/// \param[in] ctx function context handed to ValueCounts
/// \param[in] type data type of the (empty) input and of the expected values
template <typename Type, typename T>
void CheckValueCountsNull(FunctionContext* ctx, const shared_ptr<DataType>& type) {
  // Two default-constructed (null) buffer pointers back the zero-length input.
  std::vector<std::shared_ptr<Buffer>> data_buffers(2);
  Datum input;
  input.value =
      ArrayData::Make(type, 0 /* length */, std::move(data_buffers), 0 /* null_count */);

  shared_ptr<Array> ex_values = ArrayFromJSON(type, "[]");
  shared_ptr<Array> ex_counts = ArrayFromJSON(int64(), "[]");

  shared_ptr<Array> result;
  ASSERT_OK(ValueCounts(ctx, input, &result));
  auto result_struct = std::dynamic_pointer_cast<StructArray>(result);
  // Fail the test cleanly instead of dereferencing a null pointer when the
  // result is not a StructArray.
  ASSERT_NE(result_struct, nullptr);

  // Look both fields up once and verify each exists before dereferencing it.
  auto values_field = result_struct->GetFieldByName(kValuesFieldName);
  auto counts_field = result_struct->GetFieldByName(kCountsFieldName);
  ASSERT_NE(values_field, nullptr);
  ASSERT_NE(counts_field, nullptr);
  // TODO: We probably shouldn't rely on value ordering.
  ASSERT_ARRAYS_EQUAL(*ex_values, *values_field);
  ASSERT_ARRAYS_EQUAL(*ex_counts, *counts_field);
}

/// Runs ValueCounts over an array built from in_values / in_is_valid and compares
/// the values and counts fields of the resulting struct with the expectation.
///
/// \param[in] ctx function context handed to ValueCounts
/// \param[in] type data type of the input and of the expected values
/// \param[in] in_values input values
/// \param[in] in_is_valid validity flags passed to _MakeArray for the input
/// \param[in] out_values expected distinct values, in the order they must appear
/// \param[in] out_is_valid validity flags passed to _MakeArray for the expected
///            values; the same flags are also used for the expected counts
/// \param[in] out_counts expected count for each entry of out_values
template <typename Type, typename T>
void CheckValueCounts(FunctionContext* ctx, const shared_ptr<DataType>& type,
                      const vector<T>& in_values, const vector<bool>& in_is_valid,
                      const vector<T>& out_values, const vector<bool>& out_is_valid,
                      const vector<int64_t>& out_counts) {
  shared_ptr<Array> input = _MakeArray<Type, T>(type, in_values, in_is_valid);
  shared_ptr<Array> ex_values = _MakeArray<Type, T>(type, out_values, out_is_valid);
  shared_ptr<Array> ex_counts =
      _MakeArray<Int64Type, int64_t>(int64(), out_counts, out_is_valid);

  shared_ptr<Array> result;
  ASSERT_OK(ValueCounts(ctx, input, &result));
  auto result_struct = std::dynamic_pointer_cast<StructArray>(result);
  // Fail the test cleanly instead of dereferencing a null pointer when the
  // result is not a StructArray.
  ASSERT_NE(result_struct, nullptr);
  // TODO: We probably shouldn't rely on value ordering.
  ASSERT_ARRAYS_EQUAL(*ex_values, *result_struct->field(kValuesFieldIndex));
  ASSERT_ARRAYS_EQUAL(*ex_counts, *result_struct->field(kCountsFieldIndex));
}

template <typename Type, typename T>
void CheckDictEncode(FunctionContext* ctx, const shared_ptr<DataType>& type,
const vector<T>& in_values, const vector<bool>& in_is_valid,
Expand Down Expand Up @@ -104,6 +144,16 @@ TYPED_TEST(TestHashKernelPrimitive, Unique) {
{3, 1}, {});
}

// ValueCounts over primitive types: an input containing null slots, an empty
// input built through _MakeArray, and a zero-length ArrayData input.
TYPED_TEST(TestHashKernelPrimitive, ValueCounts) {
  using T = typename TypeParam::c_type;
  auto type = TypeTraits<TypeParam>::type_singleton();

  // Slots 1 and 6 are null, which leaves 2 (x3), 1 (x1) and 3 (x1).
  const vector<T> in_values = {2, 1, 2, 1, 2, 3, 4};
  const vector<bool> in_is_valid = {true, false, true, true, true, true, false};
  const vector<T> ex_values = {2, 1, 3};
  const vector<int64_t> ex_counts = {3, 1, 1};
  CheckValueCounts<TypeParam, T>(&this->ctx_, type, in_values, in_is_valid, ex_values,
                                 {}, ex_counts);

  // Empty input, empty expectation.
  const vector<T> no_values;
  const vector<bool> no_validity;
  const vector<int64_t> no_counts;
  CheckValueCounts<TypeParam, T>(&this->ctx_, type, no_values, no_validity, no_values,
                                 no_validity, no_counts);

  CheckValueCountsNull<TypeParam, T>(&this->ctx_, type);
}

TYPED_TEST(TestHashKernelPrimitive, DictEncode) {
using T = typename TypeParam::c_type;
auto type = TypeTraits<TypeParam>::type_singleton();
Expand All @@ -121,19 +171,21 @@ TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) {
vector<T> values;
vector<T> uniques;
vector<int32_t> indices;
vector<int64_t> counts;
for (int64_t i = 0; i < kTotalValues * kRepeats; i++) {
const auto val = static_cast<T>(i % kTotalValues);
values.push_back(val);

if (i < kTotalValues) {
uniques.push_back(val);
counts.push_back(kRepeats);
}
indices.push_back(static_cast<int32_t>(i % kTotalValues));
}

auto type = TypeTraits<TypeParam>::type_singleton();
CheckUnique<TypeParam, T>(&this->ctx_, type, values, {}, uniques, {});

CheckValueCounts<TypeParam, T>(&this->ctx_, type, values, {}, uniques, {}, counts);
CheckDictEncode<TypeParam, T>(&this->ctx_, type, values, {}, uniques, {}, indices);
}

Expand All @@ -149,6 +201,19 @@ TEST_F(TestHashKernel, UniqueTimeTimestamp) {
{});
}

// ValueCounts over time32, time64 and timestamp arrays. Every case uses the same
// logical input: values {2, 1, 2, 1} with slot 1 null, giving 2 (x2) and 1 (x1).
TEST_F(TestHashKernel, ValueCountsTimeTimestamp) {
  const vector<bool> in_is_valid = {true, false, true, true};
  const vector<bool> ex_is_valid;
  const vector<int64_t> ex_counts = {2, 1};

  // 32-bit storage.
  const vector<int32_t> values32 = {2, 1, 2, 1};
  const vector<int32_t> uniques32 = {2, 1};
  CheckValueCounts<Time32Type, int32_t>(&this->ctx_, time32(TimeUnit::SECOND), values32,
                                        in_is_valid, uniques32, ex_is_valid, ex_counts);

  // 64-bit storage, shared by time64 and timestamp.
  const vector<int64_t> values64 = {2, 1, 2, 1};
  const vector<int64_t> uniques64 = {2, 1};
  CheckValueCounts<Time64Type, int64_t>(&this->ctx_, time64(TimeUnit::NANO), values64,
                                        in_is_valid, uniques64, ex_is_valid, ex_counts);

  CheckValueCounts<TimestampType, int64_t>(&this->ctx_, timestamp(TimeUnit::NANO),
                                           values64, in_is_valid, uniques64, ex_is_valid,
                                           ex_counts);
}

TEST_F(TestHashKernel, UniqueBoolean) {
CheckUnique<BooleanType, bool>(&this->ctx_, boolean(), {true, true, false, true},
{true, false, true, true}, {true, false}, {});
Expand All @@ -164,6 +229,23 @@ TEST_F(TestHashKernel, UniqueBoolean) {
{false, true}, {});
}

// ValueCounts over boolean arrays, with and without a null slot, for inputs whose
// first value is true and inputs whose first value is false.
TEST_F(TestHashKernel, ValueCountsBoolean) {
  const vector<bool> true_first = {true, true, false, true};
  const vector<bool> false_first = {false, true, false, true};
  const vector<bool> ex_true_first = {true, false};
  const vector<bool> ex_false_first = {false, true};
  const vector<bool> slot1_null = {true, false, true, true};
  const vector<bool> no_validity;

  // Slot 1 is null in both inputs.
  CheckValueCounts<BooleanType, bool>(&this->ctx_, boolean(), true_first, slot1_null,
                                      ex_true_first, no_validity, {2, 1});

  CheckValueCounts<BooleanType, bool>(&this->ctx_, boolean(), false_first, slot1_null,
                                      ex_false_first, no_validity, {2, 1});

  // No nulls
  CheckValueCounts<BooleanType, bool>(&this->ctx_, boolean(), true_first, no_validity,
                                      ex_true_first, no_validity, {3, 1});

  CheckValueCounts<BooleanType, bool>(&this->ctx_, boolean(), false_first, no_validity,
                                      ex_false_first, no_validity, {2, 2});
}

TEST_F(TestHashKernel, DictEncodeBoolean) {
CheckDictEncode<BooleanType, bool>(
&this->ctx_, boolean(), {true, true, false, true, false},
Expand Down Expand Up @@ -192,6 +274,16 @@ TEST_F(TestHashKernel, UniqueBinary) {
{true, false, true, true}, {"test", "test2"}, {});
}

// ValueCounts over binary and utf8 arrays. Both cases share the same input, in
// which slot 1 (the empty string) is null: "test" (x2) and "test2" (x1).
TEST_F(TestHashKernel, ValueCountsBinary) {
  const vector<std::string> in_values = {"test", "", "test2", "test"};
  const vector<bool> in_is_valid = {true, false, true, true};
  const vector<std::string> ex_values = {"test", "test2"};
  const vector<bool> ex_is_valid;
  const vector<int64_t> ex_counts = {2, 1};

  CheckValueCounts<BinaryType, std::string>(&this->ctx_, binary(), in_values,
                                            in_is_valid, ex_values, ex_is_valid,
                                            ex_counts);

  CheckValueCounts<StringType, std::string>(&this->ctx_, utf8(), in_values, in_is_valid,
                                            ex_values, ex_is_valid, ex_counts);
}

TEST_F(TestHashKernel, DictEncodeBinary) {
CheckDictEncode<BinaryType, std::string>(
&this->ctx_, binary(), {"test", "", "test2", "test", "baz"},
Expand All @@ -214,6 +306,7 @@ TEST_F(TestHashKernel, BinaryResizeTable) {
vector<std::string> values;
vector<std::string> uniques;
vector<int32_t> indices;
vector<int64_t> counts;
char buf[20] = "test";

for (int32_t i = 0; i < kTotalValues * kRepeats; i++) {
Expand All @@ -224,15 +317,21 @@ TEST_F(TestHashKernel, BinaryResizeTable) {

if (i < kTotalValues) {
uniques.push_back(values.back());
counts.push_back(kRepeats);
}
indices.push_back(index);
}

CheckUnique<BinaryType, std::string>(&this->ctx_, binary(), values, {}, uniques, {});
CheckValueCounts<BinaryType, std::string>(&this->ctx_, binary(), values, {}, uniques,
{}, counts);

CheckDictEncode<BinaryType, std::string>(&this->ctx_, binary(), values, {}, uniques, {},
indices);

CheckUnique<StringType, std::string>(&this->ctx_, utf8(), values, {}, uniques, {});
CheckValueCounts<StringType, std::string>(&this->ctx_, utf8(), values, {}, uniques, {},
counts);
CheckDictEncode<StringType, std::string>(&this->ctx_, utf8(), values, {}, uniques, {},
indices);
}
Expand Down Expand Up @@ -291,6 +390,15 @@ TEST_F(TestHashKernel, UniqueDecimal) {
{true, false, true, true}, expected, {});
}

// ValueCounts over a decimal(2, 0) array; slot 1 is null, which leaves
// 12 (x2) and 11 (x1).
TEST_F(TestHashKernel, ValueCountsDecimal) {
  const vector<Decimal128> in_values{12, 12, 11, 12};
  const vector<bool> in_is_valid = {true, false, true, true};
  const vector<Decimal128> ex_values{12, 11};
  const vector<bool> ex_is_valid;
  const vector<int64_t> ex_counts = {2, 1};

  CheckValueCounts<Decimal128Type, Decimal128>(&this->ctx_, decimal(2, 0), in_values,
                                               in_is_valid, ex_values, ex_is_valid,
                                               ex_counts);
}

TEST_F(TestHashKernel, DictEncodeDecimal) {
vector<Decimal128> values{12, 12, 11, 12, 13};
vector<Decimal128> expected{12, 11, 13};
Expand All @@ -300,6 +408,20 @@ TEST_F(TestHashKernel, DictEncodeDecimal) {
{}, {0, 0, 1, 0, 2});
}

/* TODO(ARROW-4124): Determine if we want to do something that is reproducible with floats.
TEST_F(TestHashKernel, ValueCountsFloat) {

// No nulls
CheckValueCounts<FloatType, float>(&this->ctx_, float32(), {1.0f, 0.0f, -0.0f,
std::nan("1"), std::nan("2") },
{}, {0.0f, 1.0f, std::nan("1")}, {}, {});

CheckValueCounts<DoubleType, double>(&this->ctx_, float64(), {1.0f, 0.0f, -0.0f,
std::nan("1"), std::nan("2") },
{}, {0.0f, 1.0f, std::nan("1")}, {}, {});
}
*/

TEST_F(TestHashKernel, ChunkedArrayInvoke) {
vector<std::string> values1 = {"foo", "bar", "foo"};
vector<std::string> values2 = {"bar", "baz", "quuux", "foo"};
Expand All @@ -311,6 +433,9 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) {
vector<std::string> dict_values = {"foo", "bar", "baz", "quuux"};
auto ex_dict = _MakeArray<StringType, std::string>(type, dict_values, {});

vector<int64_t> counts = {3, 2, 1, 1};
auto ex_counts = _MakeArray<Int64Type, int64_t>(int64(), counts, {});

ArrayVector arrays = {a1, a2};
auto carr = std::make_shared<ChunkedArray>(arrays);

Expand All @@ -329,6 +454,14 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) {
std::make_shared<DictionaryArray>(dict_type, i2)};
auto dict_carr = std::make_shared<ChunkedArray>(dict_arrays);

// Unique counts
shared_ptr<Array> counts_array;
ASSERT_OK(ValueCounts(&this->ctx_, carr, &counts_array));
auto counts_struct = std::dynamic_pointer_cast<StructArray>(counts_array);
ASSERT_ARRAYS_EQUAL(*ex_dict, *counts_struct->field(0));
ASSERT_ARRAYS_EQUAL(*ex_counts, *counts_struct->field(1));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems reasonable to expect a consistent field position

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this in relation to the YAGNI comment above?


// Dictionary encode
Datum encoded_out;
ASSERT_OK(DictionaryEncode(&this->ctx_, carr, &encoded_out));
ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind());
Expand Down
Loading