diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 77404cd7025..f366645cd5c 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -161,6 +161,8 @@ struct Field {
 
   std::string ToString() const;
 };
+// Shared-ownership handle to a Field.
+typedef std::shared_ptr<Field> FieldPtr;
 
 template <typename Derived>
 struct PrimitiveType : public DataType {
diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc
index 78036d4bf57..bcb0ec49090 100644
--- a/cpp/src/arrow/types/construct.cc
+++ b/cpp/src/arrow/types/construct.cc
@@ -23,6 +23,7 @@
 #include "arrow/types/list.h"
 #include "arrow/types/primitive.h"
 #include "arrow/types/string.h"
+#include "arrow/types/struct.h"
 #include "arrow/util/buffer.h"
 #include "arrow/util/status.h"
 
@@ -66,6 +67,22 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
       out->reset(new ListBuilder(pool, value_builder));
       return Status::OK();
     }
+
+    // Struct: recursively create one builder per child field, in field order.
+    case Type::STRUCT: {
+      const std::vector<FieldPtr>& fields = type->children_;
+      std::vector<std::shared_ptr<ArrayBuilder>> values_builder;
+      values_builder.reserve(fields.size());
+
+      for (const auto& field : fields) {
+        std::shared_ptr<ArrayBuilder> builder;
+        RETURN_NOT_OK(MakeBuilder(pool, field->type, &builder));
+        values_builder.push_back(builder);
+      }
+      out->reset(new StructBuilder(pool, type, values_builder));
+      return Status::OK();
+    }
+
     default:
       return Status::NotImplemented(type->ToString());
   }
diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h
index 43c0018c67e..d0370840ca1 100644
--- a/cpp/src/arrow/types/construct.h
+++ b/cpp/src/arrow/types/construct.h
@@ -20,13 +20,15 @@
 
 #include <cstdint>
 #include <memory>
+#include <vector>
 
 namespace arrow {
 
 class Array;
 class ArrayBuilder;
 class Buffer;
 struct DataType;
+struct Field;
 class MemoryPool;
 class Status;
 
diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc
index 79d560e19bc..d2bd2971d04 100644
--- a/cpp/src/arrow/types/struct-test.cc
+++ b/cpp/src/arrow/types/struct-test.cc
@@ -21,7 +21,16 @@
 
 #include "gtest/gtest.h"
 
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/test-util.h"
 #include "arrow/type.h"
+#include "arrow/types/construct.h"
+#include "arrow/types/list.h"
+#include "arrow/types/primitive.h"
+#include "arrow/types/struct.h"
+#include "arrow/types/test-common.h"
+#include "arrow/util/status.h"
 
 using std::shared_ptr;
 using std::string;
@@ -52,4 +61,358 @@ TEST(TestStructType, Basics) {
   // TODO(wesm): out of bounds for field(...)
 }
 
+// Checks `result` against the four-row data set shared by TestBasics and
+// BulkAppend: field 0 is a list-of-int8 column and field 1 an int32 column.
+// The expected row count (4), list value count (10) and null counts are
+// hard-coded for that data set.
+void ValidateBasicStructArray(const StructArray* result,
+    const vector<uint8_t>& struct_is_valid, const vector<char>& list_values,
+    const vector<uint8_t>& list_is_valid, const vector<int>& list_lengths,
+    const vector<int>& list_offsets, const vector<int32_t>& int_values) {
+  ASSERT_EQ(4, result->length());
+  ASSERT_OK(result->Validate());
+
+  auto list_char_arr = static_cast<ListArray*>(result->field(0).get());
+  auto char_arr = static_cast<Int8Array*>(list_char_arr->values().get());
+  auto int32_arr = static_cast<Int32Array*>(result->field(1).get());
+
+  ASSERT_EQ(0, result->null_count());
+  ASSERT_EQ(1, list_char_arr->null_count());
+  ASSERT_EQ(0, int32_arr->null_count());
+
+  // Validity of the struct slots and of the list slots
+  for (size_t i = 0; i < struct_is_valid.size(); ++i) {
+    ASSERT_EQ(struct_is_valid[i] == 0, result->IsNull(i));
+  }
+  for (size_t i = 0; i < list_is_valid.size(); ++i) {
+    ASSERT_EQ(list_is_valid[i] == 0, list_char_arr->IsNull(i));
+  }
+
+  // List
+  ASSERT_EQ(4, list_char_arr->length());
+  ASSERT_EQ(10, list_char_arr->values()->length());
+  for (size_t i = 0; i < list_offsets.size(); ++i) {
+    ASSERT_EQ(list_offsets[i], list_char_arr->offsets()[i]);
+  }
+  for (size_t i = 0; i < list_values.size(); ++i) {
+    ASSERT_EQ(list_values[i], char_arr->Value(i));
+  }
+
+  // Int32
+  ASSERT_EQ(4, int32_arr->length());
+  for (size_t i = 0; i < int_values.size(); ++i) {
+    ASSERT_EQ(int_values[i], int32_arr->Value(i));
+  }
+}
+
+// ----------------------------------------------------------------------------------
+// Struct test
+
+// Fixture that, before each test, asks MakeBuilder() for a builder of a struct
+// type with two fields: "list" (list of int8) and "int" (int32).
+class TestStructBuilder : public TestBuilder {
+ public:
+  void SetUp() {
+    TestBuilder::SetUp();
+
+    auto int32_type = TypePtr(new Int32Type());
+    auto char_type = TypePtr(new Int8Type());
+    auto list_type = TypePtr(new ListType(char_type));
+
+    std::vector<FieldPtr> fields;
+    fields.push_back(FieldPtr(new Field("list", list_type)));
+    fields.push_back(FieldPtr(new Field("int", int32_type)));
+
+    type_ = TypePtr(new StructType(fields));
+    value_fields_ = fields;
+
+    std::shared_ptr<ArrayBuilder> tmp;
+    ASSERT_OK(MakeBuilder(pool_, type_, &tmp));
+
+    builder_ = std::dynamic_pointer_cast<StructBuilder>(tmp);
+    // Fail here rather than crash on a null dereference if the cast fails.
+    ASSERT_TRUE(builder_ != nullptr);
+    ASSERT_EQ(2, builder_->field_builders().size());
+  }
+
+  // Finishes the builder and stores the resulting StructArray in result_.
+  void Done() { result_ = std::dynamic_pointer_cast<StructArray>(builder_->Finish()); }
+
+ protected:
+  std::vector<FieldPtr> value_fields_;
+  TypePtr type_;
+
+  std::shared_ptr<StructBuilder> builder_;
+  std::shared_ptr<StructArray> result_;
+};
+
+// Appends two null structs; each child builder gets its own two null appends.
+TEST_F(TestStructBuilder, TestAppendNull) {
+  ASSERT_OK(builder_->AppendNull());
+  ASSERT_OK(builder_->AppendNull());
+  ASSERT_EQ(2, builder_->field_builders().size());
+
+  ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0).get());
+  ASSERT_OK(list_vb->AppendNull());
+  ASSERT_OK(list_vb->AppendNull());
+  ASSERT_EQ(2, list_vb->length());
+
+  Int32Builder* int_vb = static_cast<Int32Builder*>(builder_->field_builder(1).get());
+  ASSERT_OK(int_vb->AppendNull());
+  ASSERT_OK(int_vb->AppendNull());
+  ASSERT_EQ(2, int_vb->length());
+
+  Done();
+
+  ASSERT_OK(result_->Validate());
+
+  ASSERT_EQ(2, result_->fields().size());
+  ASSERT_EQ(2, result_->length());
+  ASSERT_EQ(2, result_->field(0)->length());
+  ASSERT_EQ(2, result_->field(1)->length());
+  ASSERT_TRUE(result_->IsNull(0));
+  ASSERT_TRUE(result_->IsNull(1));
+  ASSERT_TRUE(result_->field(0)->IsNull(0));
+  ASSERT_TRUE(result_->field(0)->IsNull(1));
+  ASSERT_TRUE(result_->field(1)->IsNull(0));
+  ASSERT_TRUE(result_->field(1)->IsNull(1));
+
+  ASSERT_EQ(Type::LIST, result_->field(0)->type_enum());
+  ASSERT_EQ(Type::INT32, result_->field(1)->type_enum());
+}
+
+// Appends element by element, then checks the finished array.
+TEST_F(TestStructBuilder, TestBasics) {
+  vector<int32_t> int_values = {1, 2, 3, 4};
+  vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'};
+  vector<int> list_lengths = {3, 0, 3, 4};
+  vector<int> list_offsets = {0, 3, 3, 6, 10};
+  vector<uint8_t> list_is_valid = {1, 0, 1, 1};
+  vector<uint8_t> struct_is_valid = {1, 1, 1, 1};
+
+  ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0).get());
+  Int8Builder* char_vb = static_cast<Int8Builder*>(list_vb->value_builder().get());
+  Int32Builder* int_vb = static_cast<Int32Builder*>(builder_->field_builder(1).get());
+  ASSERT_EQ(2, builder_->field_builders().size());
+
+  // Fatal on failure: the UnsafeAppend calls below presumably rely on this capacity.
+  ASSERT_OK(builder_->Resize(list_lengths.size()));
+  ASSERT_OK(char_vb->Resize(list_values.size()));
+  ASSERT_OK(int_vb->Resize(int_values.size()));
+
+  int pos = 0;
+  for (size_t i = 0; i < list_lengths.size(); ++i) {
+    ASSERT_OK(list_vb->Append(list_is_valid[i] > 0));
+    int_vb->UnsafeAppend(int_values[i]);
+    for (int j = 0; j < list_lengths[i]; ++j) {
+      char_vb->UnsafeAppend(list_values[pos++]);
+    }
+  }
+
+  for (size_t i = 0; i < struct_is_valid.size(); ++i) {
+    ASSERT_OK(builder_->Append(struct_is_valid[i] > 0));
+  }
+
+  Done();
+
+  ValidateBasicStructArray(result_.get(), struct_is_valid, list_values, list_is_valid,
+      list_lengths, list_offsets, int_values);
+}
+
+// Same data as TestBasics, appended through the bulk Append overloads.
+TEST_F(TestStructBuilder, BulkAppend) {
+  vector<int32_t> int_values = {1, 2, 3, 4};
+  vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'};
+  vector<int> list_lengths = {3, 0, 3, 4};
+  vector<int> list_offsets = {0, 3, 3, 6};
+  vector<uint8_t> list_is_valid = {1, 0, 1, 1};
+  vector<uint8_t> struct_is_valid = {1, 1, 1, 1};
+
+  ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0).get());
+  Int8Builder* char_vb = static_cast<Int8Builder*>(list_vb->value_builder().get());
+  Int32Builder* int_vb = static_cast<Int32Builder*>(builder_->field_builder(1).get());
+
+  ASSERT_OK(builder_->Resize(list_lengths.size()));
+  ASSERT_OK(char_vb->Resize(list_values.size()));
+  ASSERT_OK(int_vb->Resize(int_values.size()));
+
+  ASSERT_OK(builder_->Append(struct_is_valid.size(), struct_is_valid.data()));
+
+  ASSERT_OK(
+      list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()));
+  for (int8_t value : list_values) {
+    char_vb->UnsafeAppend(value);
+  }
+  for (int32_t value : int_values) {
+    int_vb->UnsafeAppend(value);
+  }
+
+  Done();
+  ValidateBasicStructArray(result_.get(), struct_is_valid, list_values, list_is_valid,
+      list_lengths, list_offsets, int_values);
+}
+
+// Bulk append with a deliberately wrong parent validity vector.
+TEST_F(TestStructBuilder, BulkAppendInvalid) {
+  vector<int32_t> int_values = {1, 2, 3, 4};
+  vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'};
+  vector<int> list_lengths = {3, 0, 3, 4};
+  vector<int> list_offsets = {0, 3, 3, 6};
+  vector<uint8_t> list_is_valid = {1, 0, 1, 1};
+  vector<uint8_t> struct_is_valid = {1, 0, 1, 1};  // should be 1, 1, 1, 1
+
+  ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0).get());
+  Int8Builder* char_vb = static_cast<Int8Builder*>(list_vb->value_builder().get());
+  Int32Builder* int_vb = static_cast<Int32Builder*>(builder_->field_builder(1).get());
+
+  ASSERT_OK(builder_->Reserve(list_lengths.size()));
+  ASSERT_OK(char_vb->Reserve(list_values.size()));
+  ASSERT_OK(int_vb->Reserve(int_values.size()));
+
+  ASSERT_OK(builder_->Append(struct_is_valid.size(), struct_is_valid.data()));
+
+  ASSERT_OK(
+      list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()));
+  for (int8_t value : list_values) {
+    char_vb->UnsafeAppend(value);
+  }
+  for (int32_t value : int_values) {
+    int_vb->UnsafeAppend(value);
+  }
+
+  Done();
+  // The parent struct's null bitmap is deliberately wrong here; Validate() does
+  // not inspect it, so validation still succeeds.
+  ASSERT_OK(result_->Validate());
+}
+
+// Builds five arrays from the same builder and checks Equals() / RangeEquals().
+TEST_F(TestStructBuilder, TestEquality) {
+  ArrayPtr array, equal_array;
+  ArrayPtr unequal_bitmap_array, unequal_offsets_array, unequal_values_array;
+
+  vector<int32_t> int_values = {1, 2, 3, 4};
+  vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'};
+  vector<int> list_lengths = {3, 0, 3, 4};
+  vector<int> list_offsets = {0, 3, 3, 6};
+  vector<uint8_t> list_is_valid = {1, 0, 1, 1};
+  vector<uint8_t> struct_is_valid = {1, 1, 1, 1};
+
+  vector<int32_t> unequal_int_values = {4, 2, 3, 1};
+  vector<char> unequal_list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'l', 'u', 'c', 'y'};
+  vector<int> unequal_list_offsets = {0, 3, 4, 6};
+  vector<uint8_t> unequal_list_is_valid = {1, 1, 1, 1};
+  vector<uint8_t> unequal_struct_is_valid = {1, 0, 0, 1};
+
+  ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0).get());
+  Int8Builder* char_vb = static_cast<Int8Builder*>(list_vb->value_builder().get());
+  Int32Builder* int_vb = static_cast<Int32Builder*>(builder_->field_builder(1).get());
+  ASSERT_OK(builder_->Reserve(list_lengths.size()));
+  ASSERT_OK(char_vb->Reserve(list_values.size()));
+  ASSERT_OK(int_vb->Reserve(int_values.size()));
+
+  // setup two equal arrays, one of which takes an unequal bitmap
+  ASSERT_OK(builder_->Append(struct_is_valid.size(), struct_is_valid.data()));
+  ASSERT_OK(
+      list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()));
+  for (int8_t value : list_values) {
+    char_vb->UnsafeAppend(value);
+  }
+  for (int32_t value : int_values) {
+    int_vb->UnsafeAppend(value);
+  }
+  array = builder_->Finish();
+
+  ASSERT_OK(builder_->Resize(list_lengths.size()));
+  ASSERT_OK(char_vb->Resize(list_values.size()));
+  ASSERT_OK(int_vb->Resize(int_values.size()));
+
+  ASSERT_OK(builder_->Append(struct_is_valid.size(), struct_is_valid.data()));
+  ASSERT_OK(
+      list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()));
+  for (int8_t value : list_values) {
+    char_vb->UnsafeAppend(value);
+  }
+  for (int32_t value : int_values) {
+    int_vb->UnsafeAppend(value);
+  }
+  equal_array = builder_->Finish();
+
+  ASSERT_OK(builder_->Resize(list_lengths.size()));
+  ASSERT_OK(char_vb->Resize(list_values.size()));
+  ASSERT_OK(int_vb->Resize(int_values.size()));
+
+  // setup an unequal one with the unequal bitmap
+  ASSERT_OK(
+      builder_->Append(unequal_struct_is_valid.size(), unequal_struct_is_valid.data()));
+  ASSERT_OK(
+      list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()));
+  for (int8_t value : list_values) {
+    char_vb->UnsafeAppend(value);
+  }
+  for (int32_t value : int_values) {
+    int_vb->UnsafeAppend(value);
+  }
+  unequal_bitmap_array = builder_->Finish();
+
+  ASSERT_OK(builder_->Resize(list_lengths.size()));
+  ASSERT_OK(char_vb->Resize(list_values.size()));
+  ASSERT_OK(int_vb->Resize(int_values.size()));
+
+  // setup an unequal one with unequal offsets
+  ASSERT_OK(builder_->Append(struct_is_valid.size(), struct_is_valid.data()));
+  ASSERT_OK(list_vb->Append(unequal_list_offsets.data(), unequal_list_offsets.size(),
+      unequal_list_is_valid.data()));
+  for (int8_t value : list_values) {
+    char_vb->UnsafeAppend(value);
+  }
+  for (int32_t value : int_values) {
+    int_vb->UnsafeAppend(value);
+  }
+  unequal_offsets_array = builder_->Finish();
+
+  ASSERT_OK(builder_->Resize(list_lengths.size()));
+  ASSERT_OK(char_vb->Resize(list_values.size()));
+  ASSERT_OK(int_vb->Resize(int_values.size()));
+
+  // setup an unequal one with unequal values
+  ASSERT_OK(builder_->Append(struct_is_valid.size(), struct_is_valid.data()));
+  ASSERT_OK(
+      list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()));
+  for (int8_t value : unequal_list_values) {
+    char_vb->UnsafeAppend(value);
+  }
+  for (int32_t value : unequal_int_values) {
+    int_vb->UnsafeAppend(value);
+  }
+  unequal_values_array = builder_->Finish();
+
+  // Test array equality
+  EXPECT_TRUE(array->Equals(array));
+  EXPECT_TRUE(array->Equals(equal_array));
+  EXPECT_TRUE(equal_array->Equals(array));
+  EXPECT_FALSE(equal_array->Equals(unequal_bitmap_array));
+  EXPECT_FALSE(unequal_bitmap_array->Equals(equal_array));
+  EXPECT_FALSE(unequal_bitmap_array->Equals(unequal_values_array));
+  EXPECT_FALSE(unequal_values_array->Equals(unequal_bitmap_array));
+  EXPECT_FALSE(unequal_bitmap_array->Equals(unequal_offsets_array));
+  EXPECT_FALSE(unequal_offsets_array->Equals(unequal_bitmap_array));
+
+  // Test range equality
+  EXPECT_TRUE(array->RangeEquals(0, 4, 0, equal_array));
+  EXPECT_TRUE(array->RangeEquals(3, 4, 3, unequal_bitmap_array));
+  EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_offsets_array));
+  EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_offsets_array));
+  EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_offsets_array));
+  EXPECT_FALSE(array->RangeEquals(0, 1, 0, unequal_values_array));
+  EXPECT_TRUE(array->RangeEquals(1, 3, 1, unequal_values_array));
+  EXPECT_FALSE(array->RangeEquals(3, 4, 3, unequal_values_array));
+}
+
+TEST_F(TestStructBuilder, TestZeroLength) {
+  // All buffers are null
+  Done();
+  ASSERT_OK(result_->Validate());
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc
index 04a277a86fa..e8176f08268 100644
--- a/cpp/src/arrow/types/struct.cc
+++ b/cpp/src/arrow/types/struct.cc
@@ -17,4 +17,85 @@
 
 #include "arrow/types/struct.h"
 
-namespace arrow {}  // namespace arrow
+#include <sstream>
+
+namespace arrow {
+
+// Whole-array equality: same type id, same length, same null count, and every
+// non-null slot equal field by field.
+bool StructArray::Equals(const std::shared_ptr<Array>& arr) const {
+  if (this == arr.get()) { return true; }
+  if (!arr) { return false; }
+  if (this->type_enum() != arr->type_enum()) { return false; }
+  // Without this check a longer `arr` with a matching prefix would compare
+  // equal, and a shorter one would be read past its end by RangeEquals().
+  if (length_ != arr->length()) { return false; }
+  if (null_count_ != arr->null_count()) { return false; }
+  return RangeEquals(0, length_, 0, arr);
+}
+
+// Compares slots [start_idx, end_idx) of this array with the slots of `arr`
+// starting at other_start_idx. Null slots only need matching nullness.
+bool StructArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+    const std::shared_ptr<Array>& arr) const {
+  // Identity only implies equality when both ranges start at the same slot.
+  if (this == arr.get() && start_idx == other_start_idx) { return true; }
+  if (!arr) { return false; }
+  if (Type::STRUCT != arr->type_enum()) { return false; }
+  const auto other = static_cast<StructArray*>(arr.get());
+  // Guard other->field(j) below against a struct with a different field count.
+  if (field_arrays_.size() != other->fields().size()) { return false; }
+
+  for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) {
+    if (IsNull(i) != arr->IsNull(o_i)) { return false; }
+    if (IsNull(i)) continue;
+    for (size_t j = 0; j < field_arrays_.size(); ++j) {
+      // TODO: really we should be comparing stretches of non-null data rather
+      // than looking at one value at a time.
+      if (!field(j)->RangeEquals(i, i + 1, o_i, other->field(j))) { return false; }
+    }
+  }
+
+  return true;
+}
+
+// Checks structural consistency: non-negative length, null count within
+// bounds, all children the same length and themselves valid, and (for
+// non-empty children) child length equal to the struct length.
+Status StructArray::Validate() const {
+  if (length_ < 0) { return Status::Invalid("Length was negative"); }
+
+  if (null_count() > length_) {
+    return Status::Invalid("Null count exceeds the length of this struct");
+  }
+
+  if (field_arrays_.size() > 0) {
+    // Validate fields
+    int32_t array_length = field_arrays_[0]->length();
+    size_t idx = 0;
+    for (const auto& it : field_arrays_) {
+      if (it->length() != array_length) {
+        std::stringstream ss;
+        ss << "Length is not equal from field " << it->type()->ToString()
+           << " at position {" << idx << "}";
+        return Status::Invalid(ss.str());
+      }
+
+      const Status child_valid = it->Validate();
+      if (!child_valid.ok()) {
+        std::stringstream ss;
+        ss << "Child array invalid: " << child_valid.ToString() << " at position {" << idx
+           << "}";
+        return Status::Invalid(ss.str());
+      }
+      ++idx;
+    }
+
+    if (array_length > 0 && array_length != length_) {
+      return Status::Invalid("Struct's length is not equal to its child arrays");
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h
index 17e32993bf9..78afd29eb8d 100644
--- a/cpp/src/arrow/types/struct.h
+++ b/cpp/src/arrow/types/struct.h
@@ -23,7 +23,110 @@
 #include <vector>
 
 #include "arrow/type.h"
+#include "arrow/types/list.h"
+#include "arrow/types/primitive.h"
 
-namespace arrow {}  // namespace arrow
+namespace arrow {
+
+// Array of struct values: holds one child array per field of the struct type,
+// alongside the length, null count and null bitmap passed to the Array base.
+class StructArray : public Array {
+ public:
+  // field_arrays is copied; it is taken by const reference so temporaries bind too.
+  StructArray(const TypePtr& type, int32_t length,
+      const std::vector<ArrayPtr>& field_arrays, int32_t null_count = 0,
+      std::shared_ptr<Buffer> null_bitmap = nullptr)
+      : Array(type, length, null_count, null_bitmap), field_arrays_(field_arrays) {
+    type_ = type;
+  }
+
+  Status Validate() const override;
+
+  virtual ~StructArray() {}
+
+  // Return a shared pointer in case the requestor desires to share ownership
+  // with this array.
+  const std::shared_ptr<Array>& field(int32_t pos) const {
+    DCHECK_GT(field_arrays_.size(), 0);
+    return field_arrays_[pos];
+  }
+  const std::vector<ArrayPtr>& fields() const { return field_arrays_; }
+
+  // NOTE(review): EqualsExact has no definition in struct.cc in this change;
+  // any caller will fail to link until one is added.
+  bool EqualsExact(const StructArray& other) const;
+  bool Equals(const std::shared_ptr<Array>& arr) const override;
+  bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+      const std::shared_ptr<Array>& arr) const override;
+
+ protected:
+  // The child arrays corresponding to each field of the struct data type.
+  std::vector<ArrayPtr> field_arrays_;
+};
+
+// ---------------------------------------------------------------------------------
+// StructArray builder
+// Append, Resize and Reserve methods are acting on StructBuilder.
+// Please make sure all these methods of all child-builders' are consistently
+// called to maintain data-structure consistency.
+class StructBuilder : public ArrayBuilder {
+ public:
+  StructBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+      const std::vector<std::shared_ptr<ArrayBuilder>>& field_builders)
+      : ArrayBuilder(pool, type) {
+    field_builders_ = field_builders;
+  }
+
+  // Bulk-append `length` struct slots. valid_bytes holds one byte per slot and
+  // any zero byte marks that slot as null. Only the parent's validity bitmap is
+  // updated here: callers must still use the append or advance methods of each
+  // child builder independently to insert the data.
+  Status Append(int32_t length, const uint8_t* valid_bytes) {
+    RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(valid_bytes, length);
+    return Status::OK();
+  }
+
+  // Finishes every child builder, wraps the results in a StructArray, and
+  // resets this builder's own bitmap, capacity, length and null count.
+  std::shared_ptr<Array> Finish() override {
+    std::vector<ArrayPtr> fields;
+    fields.reserve(field_builders_.size());
+    for (const auto& it : field_builders_) {
+      fields.push_back(it->Finish());
+    }
+
+    auto result =
+        std::make_shared<StructArray>(type_, length_, fields, null_count_, null_bitmap_);
+
+    null_bitmap_ = nullptr;
+    capacity_ = length_ = null_count_ = 0;
+
+    return result;
+  }
+
+  // Append an element to the Struct. All child-builders' Append method must
+  // be called independently to maintain data-structure consistency.
+  Status Append(bool is_valid = true) {
+    RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendToBitmap(is_valid);
+    return Status::OK();
+  }
+
+  Status AppendNull() { return Append(false); }
+
+  const std::shared_ptr<ArrayBuilder> field_builder(int pos) const {
+    DCHECK_GT(field_builders_.size(), 0);
+    return field_builders_[pos];
+  }
+  const std::vector<std::shared_ptr<ArrayBuilder>>& field_builders() const {
+    return field_builders_;
+  }
+
+ protected:
+  std::vector<std::shared_ptr<ArrayBuilder>> field_builders_;
+};
+
+}  // namespace arrow
 
 #endif  // ARROW_TYPES_STRUCT_H