Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 214 additions & 0 deletions cpp/src/arrow/array-list-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,14 @@ TEST_F(TestListArray, Equality) {
ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice));
}

TEST_F(TestListArray, ValuesEquality) {
auto type = list(int32());
auto left = ArrayFromJSON(type, "[[1, 2], [3], [0]]");
auto right = ArrayFromJSON(type, "[[1, 2], [3], [100000]]");
auto offset = 2;
EXPECT_FALSE(left->Slice(offset)->Equals(right->Slice(offset)));
}

TEST_F(TestListArray, TestResize) {}

TEST_F(TestListArray, TestFromArrays) {
Expand Down Expand Up @@ -332,4 +340,210 @@ TEST_F(TestListArray, TestBuilderPreserveFieleName) {
ASSERT_EQ("counts", type.value_field()->name());
}

// ----------------------------------------------------------------------
// FixedSizeList tests

class TestFixedSizeListArray : public TestBuilder {
public:
void SetUp() {
TestBuilder::SetUp();

value_type_ = int32();
type_ = fixed_size_list(value_type_, list_size());

std::unique_ptr<ArrayBuilder> tmp;
ASSERT_OK(MakeBuilder(pool_, type_, &tmp));
builder_.reset(checked_cast<FixedSizeListBuilder*>(tmp.release()));
}

void Done() {
std::shared_ptr<Array> out;
FinishAndCheckPadding(builder_.get(), &out);
result_ = std::dynamic_pointer_cast<FixedSizeListArray>(out);
}

protected:
static constexpr int32_t list_size() { return 2; }
std::shared_ptr<DataType> value_type_;

std::shared_ptr<FixedSizeListBuilder> builder_;
std::shared_ptr<FixedSizeListArray> result_;
};

TEST_F(TestFixedSizeListArray, Equality) {
Int32Builder* vb = checked_cast<Int32Builder*>(builder_->value_builder());

std::shared_ptr<Array> array, equal_array, unequal_array;
std::vector<int32_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6};
std::vector<int32_t> unequal_values = {1, 2, 2, 2, 3, 4, 5, 2};

// setup two equal arrays
ASSERT_OK(builder_->AppendValues(equal_values.size() / list_size()));
ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size()));
ASSERT_OK(builder_->Finish(&array));

ASSERT_OK(builder_->AppendValues(equal_values.size() / list_size()));
ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size()));

ASSERT_OK(builder_->Finish(&equal_array));

// now an unequal one
ASSERT_OK(builder_->AppendValues(unequal_values.size() / list_size()));
ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size()));
ASSERT_OK(builder_->Finish(&unequal_array));

// Test array equality
AssertArraysEqual(*array, *array);
AssertArraysEqual(*array, *equal_array);
EXPECT_FALSE(equal_array->Equals(unequal_array));
EXPECT_FALSE(unequal_array->Equals(equal_array));

// Test range equality
EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array));
EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array));
EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array));
EXPECT_TRUE(array->RangeEquals(1, 3, 2, unequal_array));
}

TEST_F(TestFixedSizeListArray, TestAppendNull) {
ASSERT_OK(builder_->AppendNull());
ASSERT_OK(builder_->AppendNull());

Done();

ASSERT_OK(ValidateArray(*result_));
ASSERT_TRUE(result_->IsNull(0));
ASSERT_TRUE(result_->IsNull(1));

ASSERT_EQ(0, result_->value_offset(0));
ASSERT_EQ(list_size(), result_->value_offset(1));

auto values = result_->values();
ASSERT_EQ(list_size() * 2, values->length());
}

TEST_F(TestFixedSizeListArray, TestAppendNulls) {
ASSERT_OK(builder_->AppendNulls(3));

Done();

ASSERT_OK(ValidateArray(*result_));
ASSERT_EQ(result_->length(), 3);
ASSERT_EQ(result_->null_count(), 3);
ASSERT_TRUE(result_->IsNull(0));
ASSERT_TRUE(result_->IsNull(1));
ASSERT_TRUE(result_->IsNull(2));

ASSERT_EQ(0, result_->value_offset(0));
ASSERT_EQ(list_size(), result_->value_offset(1));
ASSERT_EQ(list_size() * 2, result_->value_offset(2));

auto values = result_->values();
ASSERT_EQ(list_size() * 3, values->length());
}

void ValidateBasicFixedSizeListArray(const FixedSizeListArray* result,
const std::vector<int32_t>& values,
const std::vector<uint8_t>& is_valid) {
ASSERT_OK(ValidateArray(*result));
ASSERT_EQ(1, result->null_count());
ASSERT_LE(result->values()->null_count(), 2);

ASSERT_EQ(3, result->length());
for (int32_t i = 0; i < 3; ++i) {
ASSERT_EQ(i * result->value_length(), result->value_offset(i));
}

for (int i = 0; i < result->length(); ++i) {
ASSERT_EQ(is_valid[i] == 0, result->IsNull(i));
}

ASSERT_EQ(result->length() * result->value_length(), result->values()->length());
auto varr = std::dynamic_pointer_cast<Int32Array>(result->values());

for (size_t i = 0; i < values.size(); ++i) {
if (is_valid[i / result->value_length()] == 0) {
continue;
}
ASSERT_EQ(values[i], varr->Value(i));
}
}

TEST_F(TestFixedSizeListArray, TestBasics) {
std::vector<int32_t> values = {0, 1, 2, 3, 4, 5};
std::vector<uint8_t> is_valid = {1, 0, 1};

Int32Builder* vb = checked_cast<Int32Builder*>(builder_->value_builder());

int pos = 0;
for (size_t i = 0; i < values.size() / list_size(); ++i) {
if (is_valid[i] == 0) {
ASSERT_OK(builder_->AppendNull());
pos += list_size();
continue;
}
ASSERT_OK(builder_->Append());
for (int j = 0; j < list_size(); ++j) {
ASSERT_OK(vb->Append(values[pos++]));
}
}

Done();
ValidateBasicFixedSizeListArray(result_.get(), values, is_valid);
}

TEST_F(TestFixedSizeListArray, BulkAppend) {
std::vector<int32_t> values = {0, 1, 2, 3, 4, 5};
std::vector<uint8_t> is_valid = {1, 0, 1};

Int32Builder* vb = checked_cast<Int32Builder*>(builder_->value_builder());

ASSERT_OK(builder_->AppendValues(values.size() / list_size(), is_valid.data()));
for (int32_t value : values) {
ASSERT_OK(vb->Append(value));
}
Done();
ValidateBasicFixedSizeListArray(result_.get(), values, is_valid);
}

TEST_F(TestFixedSizeListArray, BulkAppendInvalid) {
std::vector<int32_t> values = {0, 1, 2, 3, 4, 5};
std::vector<uint8_t> is_valid = {1, 0, 1};

Int32Builder* vb = checked_cast<Int32Builder*>(builder_->value_builder());

ASSERT_OK(builder_->AppendValues(values.size() / list_size(), is_valid.data()));
for (int32_t value : values) {
ASSERT_OK(vb->Append(value));
}
for (int32_t value : values) {
ASSERT_OK(vb->Append(value));
}

Done();
ASSERT_RAISES(Invalid, ValidateArray(*result_));
}

TEST_F(TestFixedSizeListArray, TestZeroLength) {
// All buffers are null
Done();
ASSERT_OK(ValidateArray(*result_));
}

TEST_F(TestFixedSizeListArray, TestBuilderPreserveFieleName) {
auto list_type_with_name = fixed_size_list(field("counts", int32()), list_size());

std::unique_ptr<ArrayBuilder> tmp;
ASSERT_OK(MakeBuilder(pool_, list_type_with_name, &tmp));
builder_.reset(checked_cast<FixedSizeListBuilder*>(tmp.release()));

ASSERT_OK(builder_->AppendValues(4));

std::shared_ptr<Array> list_array;
ASSERT_OK(builder_->Finish(&list_array));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, this builds an invalid FixedSizeListArray? No values are appended to the child array.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does build an invalid array, but the point of this test is just to check that the field name is correctly propagated to the built array


const auto& type = checked_cast<FixedSizeListType&>(*list_array->type());
ASSERT_EQ("counts", type.value_field()->name());
}

} // namespace arrow
54 changes: 54 additions & 0 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,44 @@ std::shared_ptr<DataType> ListArray::value_type() const {

std::shared_ptr<Array> ListArray::values() const { return values_; }

// ----------------------------------------------------------------------
// FixedSizeListArray

FixedSizeListArray::FixedSizeListArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
}

FixedSizeListArray::FixedSizeListArray(const std::shared_ptr<DataType>& type,
int64_t length,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap,
int64_t null_count, int64_t offset) {
auto internal_data = ArrayData::Make(type, length, {null_bitmap}, null_count, offset);
internal_data->child_data.emplace_back(values->data());
SetData(internal_data);
}

void FixedSizeListArray::SetData(const std::shared_ptr<ArrayData>& data) {
DCHECK_EQ(data->type->id(), Type::FIXED_SIZE_LIST);
this->Array::SetData(data);

DCHECK(list_type()->value_type()->Equals(data->child_data[0]->type));
list_size_ = list_type()->list_size();

DCHECK_EQ(data_->child_data.size(), 1);
values_ = MakeArray(data_->child_data[0]);
}

const FixedSizeListType* FixedSizeListArray::list_type() const {
return checked_cast<const FixedSizeListType*>(data_->type.get());
}

std::shared_ptr<DataType> FixedSizeListArray::value_type() const {
return list_type()->value_type();
}

std::shared_ptr<Array> FixedSizeListArray::values() const { return values_; }

// ----------------------------------------------------------------------
// String and binary

Expand Down Expand Up @@ -839,6 +877,22 @@ struct ValidateVisitor {
return ValidateOffsets(array);
}

Status Visit(const FixedSizeListArray& array) {
if (array.length() < 0) {
return Status::Invalid("Length was negative");
}
if (!array.values()) {
return Status::Invalid("values was null");
}
if (array.values()->length() != array.length() * array.value_length()) {
return Status::Invalid(
"Values Length (", array.values()->length(), ") was not equal to the length (",
array.length(), ") multiplied by the list size (", array.value_length(), ")");
}

return Status::OK();
}

Status Visit(const StructArray& array) {
if (array.length() < 0) {
return Status::Invalid("Length was negative");
Expand Down
37 changes: 37 additions & 0 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,43 @@ class ARROW_EXPORT ListArray : public Array {
std::shared_ptr<Array> values_;
};

// ----------------------------------------------------------------------
// FixedSizeListArray

/// Concrete Array class for fixed size list data
class ARROW_EXPORT FixedSizeListArray : public Array {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it worth having common abstract class?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's not much opportunity for code reuse in arrays/builders. The only advantage I can think of would be easier reuse of code which consumes ListArray (but only code which doesn't directly access the offsets buffer). Those algorithms could use ArrayData BasicListArray::GetView(int64_t i) or similar. This seems like speculative generality to me; I'd be more open to doing it if you can think of existing code which would benefit from being refactored this way.

FixedSizeListScalar and ListScalar are identical but also trivial.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed with @bkietz. Also code reuse can be achieved through templating (we generally care about performing non-virtual calls anyway).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

public:
using TypeClass = FixedSizeListType;

explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);

FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

const FixedSizeListType* list_type() const;

/// \brief Return array object containing the list's values
std::shared_ptr<Array> values() const;

std::shared_ptr<DataType> value_type() const;

// Neither of these functions will perform boundschecking
int32_t value_offset(int64_t i) const {
i += data_->offset;
return static_cast<int32_t>(list_size_ * i);
}
int32_t value_length(int64_t i = 0) const { return list_size_; }

protected:
void SetData(const std::shared_ptr<ArrayData>& data);
int32_t list_size_;

private:
std::shared_ptr<Array> values_;
};

// ----------------------------------------------------------------------
// Binary and String

Expand Down
Loading