From c885e90b78557e16f16dcddaec9983497f93d322 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sun, 10 Mar 2019 23:21:32 -0700 Subject: [PATCH] implement LargeListArray, LargeListBuilder and IPC methods --- cpp/src/arrow/array-list-test.cc | 183 ++++++++++++++++-- cpp/src/arrow/array.cc | 120 +++++++++--- cpp/src/arrow/array.h | 60 ++++++ cpp/src/arrow/array/builder_nested.cc | 80 ++++++++ cpp/src/arrow/array/builder_nested.h | 49 +++++ cpp/src/arrow/builder.cc | 8 + cpp/src/arrow/compare.cc | 49 +++-- cpp/src/arrow/ipc/json-internal.cc | 41 +++- cpp/src/arrow/ipc/metadata-internal.cc | 13 ++ cpp/src/arrow/ipc/read-write-test.cc | 15 +- cpp/src/arrow/ipc/reader.cc | 14 ++ cpp/src/arrow/ipc/test-common.h | 67 ++++++- cpp/src/arrow/ipc/writer.cc | 32 ++- cpp/src/arrow/pretty_print.cc | 7 +- cpp/src/arrow/python/arrow_to_pandas.cc | 4 + cpp/src/arrow/type.cc | 14 ++ cpp/src/arrow/type.h | 38 ++++ cpp/src/arrow/type_fwd.h | 4 + cpp/src/arrow/type_traits.h | 8 + cpp/src/arrow/visitor.cc | 2 + cpp/src/arrow/visitor.h | 2 + cpp/src/arrow/visitor_inline.h | 1 + cpp/src/gandiva/expression_registry.cc | 1 + .../gandiva/jni/expression_registry_helper.cc | 1 + cpp/src/parquet/arrow/writer.cc | 2 + format/Schema.fbs | 9 +- 26 files changed, 733 insertions(+), 91 deletions(-) diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc index cb848962baf..8274f2cb228 100644 --- a/cpp/src/arrow/array-list-test.cc +++ b/cpp/src/arrow/array-list-test.cc @@ -69,13 +69,40 @@ class TestListArray : public TestBuilder { std::shared_ptr result_; }; -TEST_F(TestListArray, Equality) { +class TestLargeListArray : public TestBuilder { + public: + void SetUp() { + TestBuilder::SetUp(); + + value_type_ = int32(); + type_ = large_list(value_type_); + + std::unique_ptr tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_.reset(checked_cast(tmp.release())); + } + + void Done() { + std::shared_ptr out; + FinishAndCheckPadding(builder_.get(), &out); + result_ = std::dynamic_pointer_cast(out); + } + + protected: + std::shared_ptr value_type_; + + std::shared_ptr builder_; + std::shared_ptr result_; +}; + +template +void TestListEquality(std::shared_ptr builder_) { Int32Builder* vb = checked_cast(builder_->value_builder()); std::shared_ptr array, equal_array, unequal_array; - vector equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; + vector equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6}; - vector unequal_offsets = {0, 1, 4, 7}; + vector unequal_offsets = {0, 1, 4, 7}; vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; // setup two equal arrays @@ -128,9 +155,17 @@ TEST_F(TestListArray, Equality) { ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice)); } +TEST_F(TestListArray, Equality) { TestListEquality(builder_); } + +TEST_F(TestLargeListArray, Equality) { + TestListEquality(builder_); +} + TEST_F(TestListArray, TestResize) {} -TEST_F(TestListArray, TestFromArrays) { +template +void TestListFromArrays(MemoryPool* pool_, std::shared_ptr builder_) { std::shared_ptr offsets1, offsets2, offsets3, offsets4, values; std::vector offsets_is_valid3 = {true, false, true, true}; @@ -138,26 +173,28 @@ TEST_F(TestListArray, TestFromArrays) { std::vector values_is_valid = {true, false, true, true, true, true}; - std::vector offset1_values = {0, 2, 2, 6}; - std::vector offset2_values = {0, 2, 6, 6}; + std::vector offset1_values = {0, 2, 2, 6}; + std::vector offset2_values = {0, 2, 6, 6}; std::vector values_values = {0, 1, 2, 3, 4, 5}; const int length = 3; - ArrayFromVector(offset1_values, &offsets1); - ArrayFromVector(offset2_values, &offsets2); + ArrayFromVector(offset1_values, &offsets1); + ArrayFromVector(offset2_values, &offsets2); - ArrayFromVector(offsets_is_valid3, offset1_values, &offsets3); - ArrayFromVector(offsets_is_valid4, offset2_values, &offsets4); + ArrayFromVector(offsets_is_valid3, offset1_values, + &offsets3); + ArrayFromVector(offsets_is_valid4, offset2_values, + &offsets4); ArrayFromVector(values_is_valid, values_values, &values); auto list_type = list(int8()); std::shared_ptr list1, list3, list4; - ASSERT_OK(ListArray::FromArrays(*offsets1, *values, pool_, &list1)); - ASSERT_OK(ListArray::FromArrays(*offsets3, *values, pool_, &list3)); - ASSERT_OK(ListArray::FromArrays(*offsets4, *values, pool_, &list4)); + ASSERT_OK(ListType::FromArrays(*offsets1, *values, pool_, &list1)); + ASSERT_OK(ListType::FromArrays(*offsets3, *values, pool_, &list3)); + ASSERT_OK(ListType::FromArrays(*offsets4, *values, pool_, &list4)); ListArray expected1(list_type, length, offsets1->data()->buffers[1], values, offsets1->data()->buffers[0], 0); @@ -187,6 +224,15 @@ TEST_F(TestListArray, TestFromArrays) { ASSERT_RAISES(Invalid, ListArray::FromArrays(*values, *offsets1, pool_, &tmp)); } +TEST_F(TestListArray, TestFromArrays) { + TestListFromArrays(pool_, builder_); +} + +TEST_F(TestLargeListArray, TestFromArrays) { + TestListFromArrays(pool_, + builder_); +} + TEST_F(TestListArray, TestAppendNull) { ASSERT_OK(builder_->AppendNull()); ASSERT_OK(builder_->AppendNull()); @@ -207,14 +253,35 @@ TEST_F(TestListArray, TestAppendNull) { ASSERT_NE(nullptr, values->data()->buffers[1]); } -void ValidateBasicListArray(const ListArray* result, const vector& values, +TEST_F(TestLargeListArray, TestAppendNull) { + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->AppendNull()); + + Done(); + + ASSERT_OK(ValidateArray(*result_)); + ASSERT_TRUE(result_->IsNull(0)); + ASSERT_TRUE(result_->IsNull(1)); + + ASSERT_EQ(0, result_->raw_value_offsets()[0]); + ASSERT_EQ(0, result_->value_offset(1)); + ASSERT_EQ(0, result_->value_offset(2)); + + auto values = result_->values(); + ASSERT_EQ(0, values->length()); + // Values buffer should be non-null + ASSERT_NE(nullptr, values->data()->buffers[1]); +} + +template +void ValidateBasicListArray(const ListArrayType* result, const vector& values, const vector& is_valid) { ASSERT_OK(ValidateArray(*result)); ASSERT_EQ(1, result->null_count()); ASSERT_EQ(0, result->values()->null_count()); ASSERT_EQ(3, result->length()); - vector ex_offsets = {0, 3, 3, 7}; + vector ex_offsets = {0, 3, 3, 7}; for (size_t i = 0; i < ex_offsets.size(); ++i) { ASSERT_EQ(ex_offsets[i], result->value_offset(i)); } @@ -250,7 +317,29 @@ TEST_F(TestListArray, TestBasics) { } Done(); - ValidateBasicListArray(result_.get(), values, is_valid); + ValidateBasicListArray(result_.get(), values, is_valid); +} + +TEST_F(TestLargeListArray, TestBasics) { + vector values = {0, 1, 2, 3, 4, 5, 6}; + vector lengths = {3, 0, 4}; + vector is_valid = {1, 0, 1}; + + Int32Builder* vb = checked_cast(builder_->value_builder()); + + ASSERT_OK(builder_->Reserve(lengths.size())); + ASSERT_OK(vb->Reserve(values.size())); + + int pos = 0; + for (size_t i = 0; i < lengths.size(); ++i) { + ASSERT_OK(builder_->Append(is_valid[i] > 0)); + for (int j = 0; j < lengths[i]; ++j) { + ASSERT_OK(vb->Append(values[pos++])); + } + } + + Done(); + ValidateBasicListArray(result_.get(), values, is_valid); } TEST_F(TestListArray, BulkAppend) { @@ -267,7 +356,24 @@ TEST_F(TestListArray, BulkAppend) { ASSERT_OK(vb->Append(value)); } Done(); - ValidateBasicListArray(result_.get(), values, is_valid); + ValidateBasicListArray(result_.get(), values, is_valid); +} + +TEST_F(TestLargeListArray, BulkAppend) { + vector values = {0, 1, 2, 3, 4, 5, 6}; + vector lengths = {3, 0, 4}; + vector is_valid = {1, 0, 1}; + vector offsets = {0, 3, 3}; + + Int32Builder* vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Reserve(values.size())); + + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + for (int32_t value : values) { + ASSERT_OK(vb->Append(value)); + } + Done(); + ValidateBasicListArray(result_.get(), values, is_valid); } TEST_F(TestListArray, BulkAppendInvalid) { @@ -290,12 +396,38 @@ TEST_F(TestListArray, BulkAppendInvalid) { ASSERT_RAISES(Invalid, ValidateArray(*result_)); } +TEST_F(TestLargeListArray, BulkAppendInvalid) { + vector values = {0, 1, 2, 3, 4, 5, 6}; + vector lengths = {3, 0, 4}; + vector is_null = {0, 1, 0}; + vector is_valid = {1, 0, 1}; + vector offsets = {0, 2, 4}; // should be 0, 3, 3 given the is_null array + + Int32Builder* vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Reserve(values.size())); + + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + for (int32_t value : values) { + ASSERT_OK(vb->Append(value)); + } + + Done(); + ASSERT_RAISES(Invalid, ValidateArray(*result_)); +} + TEST_F(TestListArray, TestZeroLength) { // All buffers are null Done(); ASSERT_OK(ValidateArray(*result_)); } +TEST_F(TestLargeListArray, TestZeroLength) { + // All buffers are null + Done(); + ASSERT_OK(ValidateArray(*result_)); +} + TEST_F(TestListArray, TestBuilderPreserveFieleName) { auto list_type_with_name = list(field("counts", int32())); @@ -313,4 +445,21 @@ TEST_F(TestListArray, TestBuilderPreserveFieleName) { ASSERT_EQ("counts", type.value_field()->name()); } +TEST_F(TestLargeListArray, TestBuilderPreserveFieleName) { + auto list_type_with_name = large_list(field("counts", int32())); + + std::unique_ptr tmp; + ASSERT_OK(MakeBuilder(pool_, list_type_with_name, &tmp)); + builder_.reset(checked_cast(tmp.release())); + + vector values = {1, 2, 4, 8}; + ASSERT_OK(builder_->AppendValues(values.data(), values.size())); + + std::shared_ptr list_array; + ASSERT_OK(builder_->Finish(&list_array)); + + const auto& type = checked_cast(*list_array->type()); + ASSERT_EQ("counts", type.value_field()->name()); +} + } // namespace arrow diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 73d863eb874..545c6212956 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -183,42 +183,27 @@ BooleanArray::BooleanArray(int64_t length, const std::shared_ptr& data, // ---------------------------------------------------------------------- // ListArray -ListArray::ListArray(const std::shared_ptr& data) { - DCHECK_EQ(data->type->id(), Type::LIST); - SetData(data); -} - -ListArray::ListArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& value_offsets, - const std::shared_ptr& values, - const std::shared_ptr& null_bitmap, int64_t null_count, - int64_t offset) { - auto internal_data = - ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); - internal_data->child_data.emplace_back(values->data()); - SetData(internal_data); -} - -Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, - std::shared_ptr* out) { +template +Status ListArrayFromArrays(const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr* out) { if (offsets.length() == 0) { return Status::Invalid("List offsets must have non-zero length"); } - if (offsets.type_id() != Type::INT32) { + if (offsets.type_id() != type) { return Status::Invalid("List offsets must be signed int32"); } BufferVector buffers = {}; - const auto& typed_offsets = checked_cast(offsets); + const auto& typed_offsets = checked_cast(offsets); const int64_t num_offsets = offsets.length(); if (offsets.null_count() > 0) { std::shared_ptr clean_offsets, clean_valid_bits; - RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t), &clean_offsets)); + RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(OffsetType), &clean_offsets)); // Copy valid bits, zero out the bit for the final offset RETURN_NOT_OK(offsets.null_bitmap()->Copy(0, BitUtil::BytesForBits(num_offsets - 1), @@ -226,12 +211,12 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets); buffers.emplace_back(std::move(clean_valid_bits)); - const int32_t* raw_offsets = typed_offsets.raw_values(); - auto clean_raw_offsets = reinterpret_cast(clean_offsets->mutable_data()); + const OffsetType* raw_offsets = typed_offsets.raw_values(); + auto clean_raw_offsets = reinterpret_cast(clean_offsets->mutable_data()); // Must work backwards so we can tell how many values were in the last non-null value DCHECK(offsets.IsValid(num_offsets - 1)); - int32_t current_offset = raw_offsets[num_offsets - 1]; + OffsetType current_offset = raw_offsets[num_offsets - 1]; for (int64_t i = num_offsets - 1; i >= 0; --i) { if (offsets.IsValid(i)) { current_offset = raw_offsets[i]; @@ -254,6 +239,28 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo return Status::OK(); } +ListArray::ListArray(const std::shared_ptr& data) { + DCHECK_EQ(data->type->id(), Type::LIST); + SetData(data); +} + +ListArray::ListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap, int64_t null_count, + int64_t offset) { + auto internal_data = + ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); + internal_data->child_data.emplace_back(values->data()); + SetData(internal_data); +} + +Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr* out) { + return ListArrayFromArrays(offsets, values, pool, + out); +} + void ListArray::SetData(const std::shared_ptr& data) { this->Array::SetData(data); DCHECK_EQ(data->buffers.size(), 2); @@ -277,6 +284,54 @@ std::shared_ptr ListArray::value_type() const { std::shared_ptr ListArray::values() const { return values_; } +// ---------------------------------------------------------------------- +// LargeListArray + +LargeListArray::LargeListArray(const std::shared_ptr& data) { + DCHECK_EQ(data->type->id(), Type::LARGE_LIST); + SetData(data); +} + +LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap, + int64_t null_count, int64_t offset) { + auto internal_data = + ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); + internal_data->child_data.emplace_back(values->data()); + SetData(internal_data); +} + +Status LargeListArray::FromArrays(const Array& offsets, const Array& values, + MemoryPool* pool, std::shared_ptr* out) { + return ListArrayFromArrays(offsets, values, pool, + out); +} + +void LargeListArray::SetData(const std::shared_ptr& data) { + this->Array::SetData(data); + DCHECK_EQ(data->buffers.size(), 2); + + auto value_offsets = data->buffers[1]; + raw_value_offsets_ = value_offsets == nullptr + ? nullptr + : reinterpret_cast(value_offsets->data()); + + DCHECK_EQ(data_->child_data.size(), 1); + values_ = MakeArray(data_->child_data[0]); +} + +const LargeListType* LargeListArray::list_type() const { + return checked_cast(data_->type.get()); +} + +std::shared_ptr LargeListArray::value_type() const { + return list_type()->value_type(); +} + +std::shared_ptr LargeListArray::values() const { return values_; } + // ---------------------------------------------------------------------- // String and binary @@ -764,7 +819,8 @@ struct ValidateVisitor { return Status::OK(); } - Status Visit(const ListArray& array) { + template + Status VisitList(const ListArrayType& array) { if (array.length() < 0) { return Status::Invalid("Length was negative"); } @@ -773,7 +829,7 @@ struct ValidateVisitor { if (array.length() && !value_offsets) { return Status::Invalid("value_offsets_ was null"); } - if (value_offsets->size() / static_cast(sizeof(int32_t)) < array.length()) { + if (value_offsets->size() / static_cast(sizeof(OffsetType)) < array.length()) { return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), " isn't large enough for length: ", array.length()); } @@ -782,7 +838,7 @@ struct ValidateVisitor { return Status::Invalid("values was null"); } - const int32_t last_offset = array.value_offset(array.length()); + const OffsetType last_offset = array.value_offset(array.length()); if (array.values()->length() != last_offset) { return Status::Invalid("Final offset invariant not equal to values length: ", last_offset, "!=", array.values()->length()); @@ -793,12 +849,12 @@ struct ValidateVisitor { return Status::Invalid("Child array invalid: ", child_valid.ToString()); } - int32_t prev_offset = array.value_offset(0); + OffsetType prev_offset = array.value_offset(0); if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); } for (int64_t i = 1; i <= array.length(); ++i) { - int32_t current_offset = array.value_offset(i); + OffsetType current_offset = array.value_offset(i); if (array.IsNull(i - 1) && current_offset != prev_offset) { return Status::Invalid("Offset invariant failure at: ", i, " inconsistent value_offsets for null slot", @@ -814,6 +870,12 @@ struct ValidateVisitor { return Status::OK(); } + Status Visit(const ListArray& array) { return VisitList(array); } + + Status Visit(const LargeListArray& array) { + return VisitList(array); + } + Status Visit(const StructArray& array) { if (array.length() < 0) { return Status::Invalid("Length was negative"); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index bee133c017e..44c67e37aa6 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -514,6 +514,66 @@ class ARROW_EXPORT ListArray : public Array { std::shared_ptr values_; }; +// ---------------------------------------------------------------------- +// LargeListArray + +/// Concrete Array class for large list data (using 64-bit offsets) +class ARROW_EXPORT LargeListArray : public Array { + public: + using TypeClass = LargeListType; + + explicit LargeListArray(const std::shared_ptr& data); + + LargeListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct LargeListArray from array of offsets and child value array + /// + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed + /// + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. Must be of int64 type + /// \param[in] values Array containing + /// \param[in] pool MemoryPool in case new offsets array needs to be + /// allocated because of null values + /// \param[out] out Will have length equal to offsets.length() - 1 + static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr* out); + + const LargeListType* list_type() const; + + /// \brief Return array object containing the list's values + std::shared_ptr values() const; + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_offsets() const { return data_->buffers[1]; } + + std::shared_ptr value_type() const; + + /// Return pointer to raw value offsets accounting for any slice offset + const int64_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } + + // Neither of these functions will perform boundschecking + int64_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } + int64_t value_length(int64_t i) const { + i += data_->offset; + return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; + } + + protected: + void SetData(const std::shared_ptr& data); + const int64_t* raw_value_offsets_; + + private: + std::shared_ptr values_; +}; + // ---------------------------------------------------------------------- // Binary and String diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index 46637713c3e..293a8bea250 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -125,6 +125,86 @@ ArrayBuilder* ListBuilder::value_builder() const { return value_builder_.get(); } +LargeListBuilder::LargeListBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type) + : ArrayBuilder(type ? type + : std::static_pointer_cast( + std::make_shared(value_builder->type())), + pool), + offsets_builder_(pool), + value_builder_(value_builder) {} + +Status LargeListBuilder::AppendValues(const int64_t* offsets, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + offsets_builder_.UnsafeAppend(offsets, length); + return Status::OK(); +} + +Status LargeListBuilder::AppendNextOffset() { + return offsets_builder_.Append(value_builder_->length()); +} + +Status LargeListBuilder::Append(bool is_valid) { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return AppendNextOffset(); +} + +Status LargeListBuilder::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + + // one more then requested for offsets + RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); + return ArrayBuilder::Resize(capacity); +} + +Status LargeListBuilder::FinishInternal(std::shared_ptr* out) { + RETURN_NOT_OK(AppendNextOffset()); + + // Offset padding zeroed by BufferBuilder + std::shared_ptr offsets; + RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + + std::shared_ptr items; + if (values_) { + items = values_->data(); + } else { + if (value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + RETURN_NOT_OK(value_builder_->Resize(0)); + } + RETURN_NOT_OK(value_builder_->FinishInternal(&items)); + } + + // If the type has not been specified in the constructor, infer it + // This is the case if the value_builder contains a DenseUnionBuilder + if (!arrow::internal::checked_cast(*type_).value_type()) { + type_ = std::static_pointer_cast( + std::make_shared(value_builder_->type())); + } + std::shared_ptr null_bitmap; + RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); + *out = ArrayData::Make(type_, length_, {null_bitmap, offsets}, null_count_); + (*out)->child_data.emplace_back(std::move(items)); + Reset(); + return Status::OK(); +} + +void LargeListBuilder::Reset() { + ArrayBuilder::Reset(); + values_.reset(); + offsets_builder_.Reset(); + value_builder_->Reset(); +} + +ArrayBuilder* LargeListBuilder::value_builder() const { + DCHECK(!values_) << "Using value builder is pointless when values_ is set"; + return value_builder_.get(); +} + // ---------------------------------------------------------------------- // Struct diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 19b0ad81b5a..eda4e772bca 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -77,6 +77,55 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { Status AppendNextOffset(); }; +/// \class LargeListBuilder +/// \brief Builder class for large variable-length list array value types +/// +/// To use this class, you must append values to the child array builder and use +/// the Append function to delimit each distinct list value (once the values +/// have been appended to the child array) or use the bulk API to append +/// a sequence of offests and null values. +/// +/// A note on types. Per arrow/type.h all types in the c++ implementation are +/// logical so even though this class always builds list array, this can +/// represent multiple different logical types. If no logical type is provided +/// at construction time, the class defaults to List where t is taken from the +/// value_builder/values that the object is constructed with. +class ARROW_EXPORT LargeListBuilder : public ArrayBuilder { + public: + /// Use this constructor to incrementally build the value array along with offsets and + /// null bitmap. + LargeListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, + const std::shared_ptr& type = NULLPTR); + + Status Resize(int64_t capacity) override; + void Reset() override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const int64_t* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder + Status Append(bool is_valid = true); + + Status AppendNull() { return Append(false); } + + ArrayBuilder* value_builder() const; + + protected: + TypedBufferBuilder offsets_builder_; + std::shared_ptr value_builder_; + std::shared_ptr values_; + + Status AppendNextOffset(); +}; + // ---------------------------------------------------------------------- // Struct diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 8d0ab194c24..764f29722ee 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -78,6 +78,14 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, out->reset(new ListBuilder(pool, std::move(value_builder), type)); return Status::OK(); } + case Type::LARGE_LIST: { + std::unique_ptr value_builder; + std::shared_ptr value_type = + internal::checked_cast(*type).value_type(); + RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); + out->reset(new LargeListBuilder(pool, std::move(value_builder), type)); + return Status::OK(); + } case Type::STRUCT: { const std::vector>& fields = type->children(); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index fcb16b5a378..8a6cbbce536 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -106,8 +106,9 @@ class RangeEqualsVisitor { return true; } - bool CompareLists(const ListArray& left) { - const auto& right = checked_cast(right_); + template + bool CompareLists(const ListArrayType& left) { + const auto& right = checked_cast(right_); const std::shared_ptr& left_values = left.values(); const std::shared_ptr& right_values = right.values(); @@ -119,10 +120,10 @@ class RangeEqualsVisitor { return false; } if (is_null) continue; - const int32_t begin_offset = left.value_offset(i); - const int32_t end_offset = left.value_offset(i + 1); - const int32_t right_begin_offset = right.value_offset(o_i); - const int32_t right_end_offset = right.value_offset(o_i + 1); + const OffsetType begin_offset = left.value_offset(i); + const OffsetType end_offset = left.value_offset(i + 1); + const OffsetType right_begin_offset = right.value_offset(o_i); + const OffsetType right_end_offset = right.value_offset(o_i + 1); // Underlying can't be equal if the size isn't equal if (end_offset - begin_offset != right_end_offset - right_begin_offset) { return false; @@ -274,7 +275,12 @@ class RangeEqualsVisitor { } Status Visit(const ListArray& left) { - result_ = CompareLists(left); + result_ = CompareLists(left); + return Status::OK(); + } + + Status Visit(const LargeListArray& left) { + result_ = CompareLists(left); return Status::OK(); } @@ -403,20 +409,21 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { return Status::OK(); } - template + template bool ValueOffsetsEqual(const ArrayType& left) { const auto& right = checked_cast(right_); if (left.offset() == 0 && right.offset() == 0) { return left.value_offsets()->Equals(*right.value_offsets(), - (left.length() + 1) * sizeof(int32_t)); + (left.length() + 1) * sizeof(OffsetType)); } else { // One of the arrays is sliced; logic is more complicated because the // value offsets are not both 0-based auto left_offsets = - reinterpret_cast(left.value_offsets()->data()) + left.offset(); + reinterpret_cast(left.value_offsets()->data()) + + left.offset(); auto right_offsets = - reinterpret_cast(right.value_offsets()->data()) + + reinterpret_cast(right.value_offsets()->data()) + right.offset(); for (int64_t i = 0; i < left.length() + 1; ++i) { @@ -431,7 +438,7 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { bool CompareBinary(const BinaryArray& left) { const auto& right = checked_cast(right_); - bool equal_offsets = ValueOffsetsEqual(left); + bool equal_offsets = ValueOffsetsEqual(left); if (!equal_offsets) { return false; } @@ -482,7 +489,21 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { Status Visit(const ListArray& left) { const auto& right = checked_cast(right_); - bool equal_offsets = ValueOffsetsEqual(left); + bool equal_offsets = ValueOffsetsEqual(left); + if (!equal_offsets) { + result_ = false; + return Status::OK(); + } + + result_ = left.values()->RangeEquals( + left.value_offset(0), left.value_offset(left.length()) - left.value_offset(0), + right.value_offset(0), right.values()); + return Status::OK(); + } + + Status Visit(const LargeListArray& left) { + const auto& right = checked_cast(right_); + bool equal_offsets = ValueOffsetsEqual(left); if (!equal_offsets) { result_ = false; return Status::OK(); @@ -664,6 +685,8 @@ class TypeEqualsVisitor { Status Visit(const ListType& left) { return VisitChildren(left); } + Status Visit(const LargeListType& left) { return VisitChildren(left); } + Status Visit(const StructType& left) { return VisitChildren(left); } Status Visit(const UnionType& left) { diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 0420c133dbd..64dc2573618 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -185,6 +185,7 @@ class SchemaWriter { template typename std::enable_if::value || std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value, void>::type WriteTypeMetadata(const T& type) {} @@ -329,6 +330,11 @@ class SchemaWriter { return Status::OK(); } + Status Visit(const LargeListType& type) { + WriteName("large_list", type); + return Status::OK(); + } + Status Visit(const StructType& type) { WriteName("struct", type); return Status::OK(); @@ -557,6 +563,13 @@ class ArrayWriter { return WriteChildren(type.children(), {array.values()}); } + Status Visit(const LargeListArray& array) { + WriteValidityField(array); + WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length() + 1); + const auto& type = checked_cast(*array.type()); + return WriteChildren(type.children(), {array.values()}); + } + Status Visit(const StructArray& array) { WriteValidityField(array); const auto& type = checked_cast(*array.type()); @@ -958,16 +971,17 @@ class ArrayReader { Status ParseTypeValues(const DataType& type); - Status GetValidityBuffer(const std::vector& is_valid, int32_t* null_count, + template + Status GetValidityBuffer(const std::vector& is_valid, LengthType* null_count, std::shared_ptr* validity_buffer) { - int length = static_cast(is_valid.size()); + int64_t length = is_valid.size(); std::shared_ptr out_buffer; RETURN_NOT_OK(AllocateEmptyBitmap(pool_, length, &out_buffer)); uint8_t* bitmap = out_buffer->mutable_data(); *null_count = 0; - for (int i = 0; i < length; ++i) { + for (int64_t i = 0; i < length; ++i) { if (!is_valid[i]) { ++(*null_count); continue; @@ -1161,6 +1175,27 @@ class ArrayReader { return Status::OK(); } + Status Visit(const LargeListType& type) { + int64_t null_count = 0; + std::shared_ptr validity_buffer; + RETURN_NOT_OK(GetValidityBuffer(is_valid_, &null_count, &validity_buffer)); + + const auto& json_offsets = obj_->FindMember("OFFSET"); + RETURN_NOT_ARRAY("OFFSET", json_offsets, *obj_); + std::shared_ptr offsets_buffer; + RETURN_NOT_OK(GetIntArray(json_offsets->value.GetArray(), length_ + 1, + &offsets_buffer)); + + std::vector> children; + RETURN_NOT_OK(GetChildren(*obj_, type, &children)); + DCHECK_EQ(children.size(), 1); + + result_ = std::make_shared(type_, length_, offsets_buffer, + children[0], validity_buffer, null_count); + + return Status::OK(); + } + Status Visit(const StructType& type) { int32_t null_count = 0; std::shared_ptr validity_buffer; diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 2589a10035f..f98f425bfcc 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -293,6 +293,12 @@ static Status ConcreteTypeFromFlatbuffer( } *out = std::make_shared(children[0]); return Status::OK(); + case flatbuf::Type_LargeList: + if (children.size() != 1) { + return Status::Invalid("LargeList must have exactly 1 child field"); + } + *out = std::make_shared(children[0]); + return Status::OK(); case flatbuf::Type_Struct_: *out = std::make_shared(children); return Status::OK(); @@ -545,6 +551,13 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const LargeListType& type) { + fb_type_ = flatbuf::Type_LargeList; + RETURN_NOT_OK(AppendChildFields(fbb_, type, &children_, dictionary_memo_)); + type_offset_ = flatbuf::CreateLargeList(fbb_).Union(); + return Status::OK(); + } + Status Visit(const StructType& type) { fb_type_ = flatbuf::Type_Struct_; RETURN_NOT_OK(AppendChildFields(fbb_, type, &children_, dictionary_memo_)); diff --git a/cpp/src/arrow/ipc/read-write-test.cc b/cpp/src/arrow/ipc/read-write-test.cc index 6f4da28b3ff..fb093955c10 100644 --- a/cpp/src/arrow/ipc/read-write-test.cc +++ b/cpp/src/arrow/ipc/read-write-test.cc @@ -174,7 +174,9 @@ TEST_F(TestSchemaMetadata, NestedFields) { new StructType({field("k1", INT32), field("k2", INT32), field("k3", INT32)})); auto f1 = field("f1", type2); - Schema schema({f0, f1}); + auto f2 = field("f2", large_list(int32())); + + Schema schema({f0, f1, f2}); CheckRoundtrip(schema); } @@ -189,11 +191,12 @@ TEST_F(TestSchemaMetadata, KeyValueMetadata) { CheckRoundtrip(schema); } -#define BATCH_CASES() \ - ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, &MakeNonNullRecordBatch, \ - &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, \ - &MakeStringTypesRecordBatchWithNulls, &MakeStruct, &MakeUnion, \ - &MakeDictionary, &MakeDates, &MakeTimestamps, &MakeTimes, \ +#define BATCH_CASES() \ + ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, \ + &MakeLargeListRecordBatch, &MakeNonNullRecordBatch, \ + &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, \ + &MakeStringTypesRecordBatchWithNulls, &MakeStruct, &MakeUnion, \ + &MakeDictionary, &MakeDates, &MakeTimestamps, &MakeTimes, \ &MakeFWBinary, &MakeNull, &MakeDecimal, &MakeBooleanBatch); static int g_file_number = 0; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index a33f07c859c..8e40c963514 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -234,6 +234,20 @@ class ArrayLoader { return LoadChildren(type.children()); } + Status Visit(const LargeListType& type) { + out_->buffers.resize(2); + + RETURN_NOT_OK(LoadCommon()); + RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[1])); + + const int num_children = type.num_children(); + if (num_children != 1) { + return Status::Invalid("Wrong number of children: ", num_children); + } + + return LoadChildren(type.children()); + } + Status Visit(const StructType& type) { out_->buffers.resize(1); RETURN_NOT_OK(LoadCommon()); diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 34213601763..e373771d4ae 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -65,7 +65,9 @@ static inline void CompareBatchColumnsDetailed(const RecordBatch& result, } const auto kListInt32 = list(int32()); +const auto kLargeListInt32 = large_list(int32()); const auto kListListInt32 = list(kListInt32); +const auto kLargeListLargeListInt32 = large_list(kLargeListInt32); Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* out, uint32_t seed = 0) { @@ -77,8 +79,9 @@ Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool return Status::OK(); } -Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, - bool include_nulls, MemoryPool* pool, +template +Status MakeRandomListArray(bool is_large, const std::shared_ptr& child_array, + int num_lists, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { // Create the null list values std::vector valid_lists(num_lists); @@ -88,8 +91,8 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li // Create list offsets const int max_list_size = 10; - std::vector list_sizes(num_lists, 0); - std::vector offsets( + std::vector list_sizes(num_lists, 0); + std::vector offsets( num_lists + 1, 0); // +1 so we can shift for nulls. See partial sum below. const uint32_t seed = static_cast(child_array->length()); @@ -98,29 +101,50 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li // make sure sizes are consistent with null std::transform(list_sizes.begin(), list_sizes.end(), valid_lists.begin(), list_sizes.begin(), - [](int32_t size, int32_t valid) { return valid == 0 ? 0 : size; }); + [](IndexType size, IndexType valid) { return valid == 0 ? 0 : size; }); std::partial_sum(list_sizes.begin(), list_sizes.end(), ++offsets.begin()); // Force invariants - const int32_t child_length = static_cast(child_array->length()); + const IndexType child_length = static_cast(child_array->length()); offsets[0] = 0; std::replace_if(offsets.begin(), offsets.end(), - [child_length](int32_t offset) { return offset > child_length; }, + [child_length](IndexType offset) { return offset > child_length; }, child_length); } - offsets[num_lists] = static_cast(child_array->length()); + offsets[num_lists] = static_cast(child_array->length()); /// TODO(wesm): Implement support for nulls in ListArray::FromArrays std::shared_ptr null_bitmap, offsets_buffer; RETURN_NOT_OK(GetBitmapFromVector(valid_lists, &null_bitmap)); RETURN_NOT_OK(CopyBufferFromVector(offsets, pool, &offsets_buffer)); - *out = std::make_shared(list(child_array->type()), num_lists, offsets_buffer, - child_array, null_bitmap, kUnknownNullCount); + if (is_large) { + *out = std::make_shared(large_list(child_array->type()), num_lists, + offsets_buffer, child_array, null_bitmap, + kUnknownNullCount); + } else { + *out = + std::make_shared(list(child_array->type()), num_lists, offsets_buffer, + child_array, null_bitmap, kUnknownNullCount); + } return ValidateArray(**out); } +Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + return MakeRandomListArray(false, child_array, num_lists, include_nulls, pool, + out); +} + +Status MakeRandomLargeListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + return MakeRandomListArray(true, child_array, num_lists, include_nulls, pool, + out); +} + typedef Status MakeRecordBatch(std::shared_ptr* out); Status MakeRandomBooleanArray(const int length, bool include_nulls, @@ -279,6 +303,29 @@ Status MakeListRecordBatch(std::shared_ptr* out) { return Status::OK(); } +Status MakeLargeListRecordBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = field("f0", kLargeListInt32); + auto f1 = field("f1", kLargeListLargeListInt32); + auto f2 = field("f2", int32()); + auto schema = ::arrow::schema({f0, f1, f2}); + + // Example data + + MemoryPool* pool = default_memory_pool(); + const int length = 200; + std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK( + MakeRandomLargeListArray(leaf_values, length, include_nulls, pool, &list_array)); + RETURN_NOT_OK(MakeRandomLargeListArray(list_array, length, include_nulls, pool, + &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); + *out = RecordBatch::Make(schema, length, {list_array, list_list_array, flat_array}); + return Status::OK(); +} + Status MakeZeroLengthRecordBatch(std::shared_ptr* out) { // Make the schema auto f0 = field("f0", kListInt32); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index ba9939016f1..1069b837650 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -218,7 +218,7 @@ class RecordBatchSerializer : public ArrayVisitor { return Status::OK(); } - template + template Status GetZeroBasedValueOffsets(const ArrayType& array, std::shared_ptr* value_offsets) { // Share slicing logic between ListArray and BinaryArray @@ -231,13 +231,14 @@ class RecordBatchSerializer : public ArrayVisitor { // b) slice the values array accordingly std::shared_ptr shifted_offsets; - RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(int32_t) * (array.length() + 1), + RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(OffsetType) * (array.length() + 1), &shifted_offsets)); - int32_t* dest_offsets = reinterpret_cast(shifted_offsets->mutable_data()); - const int32_t start_offset = array.value_offset(0); + OffsetType* dest_offsets = + reinterpret_cast(shifted_offsets->mutable_data()); + const OffsetType start_offset = array.value_offset(0); - for (int i = 0; i < array.length(); ++i) { + for (int64_t i = 0; i < array.length(); ++i) { dest_offsets[i] = array.value_offset(i) - start_offset; } // Final offset @@ -251,7 +252,8 @@ class RecordBatchSerializer : public ArrayVisitor { Status VisitBinary(const BinaryArray& array) { std::shared_ptr value_offsets; - RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); + Status s = GetZeroBasedValueOffsets(array, &value_offsets); + RETURN_NOT_OK(s); auto data = array.value_data(); int64_t total_data_bytes = 0; @@ -312,16 +314,18 @@ class RecordBatchSerializer : public ArrayVisitor { Status Visit(const BinaryArray& array) override { return VisitBinary(array); } - Status Visit(const ListArray& array) override { + template + Status VisitList(const ListArrayType& array) { std::shared_ptr value_offsets; - RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); + Status s = GetZeroBasedValueOffsets(array, &value_offsets); + RETURN_NOT_OK(s); out_->body_buffers.emplace_back(value_offsets); --max_recursion_depth_; std::shared_ptr values = array.values(); - int32_t values_offset = 0; - int32_t values_length = 0; + OffsetType values_offset = 0; + OffsetType values_length = 0; if (value_offsets) { values_offset = array.value_offset(0); values_length = array.value_offset(array.length()) - values_offset; @@ -336,6 +340,14 @@ class RecordBatchSerializer : public ArrayVisitor { return Status::OK(); } + Status Visit(const ListArray& array) override { + return VisitList(array); + } + + Status Visit(const LargeListArray& array) override { + return VisitList(array); + } + Status Visit(const StructArray& array) override { --max_recursion_depth_; for (int i = 0; i < array.num_fields(); ++i) { diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 40339772071..d80f477d18f 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -197,7 +197,9 @@ class ArrayPrinter : public PrettyPrinter { } template - inline typename std::enable_if::value, Status>::type + inline typename std::enable_if::value || + std::is_base_of::value, + Status>::type WriteDataValues(const T& array) { bool skip_comma = true; for (int64_t i = 0; i < array.length(); ++i) { @@ -233,7 +235,8 @@ class ArrayPrinter : public PrettyPrinter { typename std::enable_if::value || std::is_base_of::value || std::is_base_of::value || - std::is_base_of::value, + std::is_base_of::value || + std::is_base_of::value, Status>::type Visit(const T& array) { OpenArray(array); diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index f5810a7120d..fac40075868 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -1868,6 +1868,10 @@ class ArrowDeserializer { #undef CONVERTVALUES_LISTSLIKE_CASE } + Status Visit(const LargeListType& type) { + return Status::NotImplemented("large list type"); + } + Status Visit(const DictionaryType& type) { auto block = std::make_shared(options_, nullptr, col_->length()); RETURN_NOT_OK(block->Write(col_, 0, 0)); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index fd37726f39a..bea4fbec072 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -130,6 +130,12 @@ std::string ListType::ToString() const { return s.str(); } +std::string LargeListType::ToString() const { + std::stringstream s; + s << "large_list<" << value_field()->ToString() << ">"; + return s.str(); +} + std::string BinaryType::ToString() const { return std::string("binary"); } int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } @@ -565,6 +571,14 @@ std::shared_ptr list(const std::shared_ptr& value_field) { return std::make_shared(value_field); } +std::shared_ptr large_list(const std::shared_ptr& value_type) { + return std::make_shared(value_type); +} + +std::shared_ptr large_list(const std::shared_ptr& value_field) { + return std::make_shared(value_field); +} + std::shared_ptr struct_(const std::vector>& fields) { return std::make_shared(fields); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 858093c4af4..725a5ac2ead 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -123,6 +123,9 @@ struct Type { /// A list of some logical data type LIST, + /// A list of some logical data type, using 64-bit offsets + LARGE_LIST, + /// Struct of logical types STRUCT, @@ -452,6 +455,33 @@ class ARROW_EXPORT ListType : public NestedType { std::string name() const override { return "list"; } }; +/// \brief Concrete type class for large list data +/// +/// List data is nested data where each value is a variable number of +/// child items. Lists can be recursively nested, for example +/// list(list(int32)). +class ARROW_EXPORT LargeListType : public NestedType { + public: + static constexpr Type::type type_id = Type::LARGE_LIST; + + // List can contain any other logical value type + explicit LargeListType(const std::shared_ptr& value_type) + : LargeListType(std::make_shared("item", value_type)) {} + + explicit LargeListType(const std::shared_ptr& value_field) + : NestedType(Type::LARGE_LIST) { + children_ = {value_field}; + } + + std::shared_ptr value_field() const { return children_[0]; } + + std::shared_ptr value_type() const { return children_[0]->type(); } + + std::string ToString() const override; + + std::string name() const override { return "large_list"; } +}; + /// \brief Concrete type class for variable-size binary data class ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { public: @@ -890,6 +920,14 @@ std::shared_ptr list(const std::shared_ptr& value_type); ARROW_EXPORT std::shared_ptr list(const std::shared_ptr& value_type); +/// \brief Create a LargeListType instance from its child Field type +ARROW_EXPORT +std::shared_ptr large_list(const std::shared_ptr& value_type); + +/// \brief Create a LargeListType instance from its child DataType +ARROW_EXPORT +std::shared_ptr large_list(const std::shared_ptr& value_type); + /// \brief Create a TimestampType instance from its unit ARROW_EXPORT std::shared_ptr timestamp(TimeUnit::type unit); diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index b995b889e81..128ded66d38 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -77,6 +77,10 @@ class ListArray; class ListBuilder; struct ListScalar; +class LargeListType; +class LargeListArray; +class LargeListBuilder; + class StructType; class StructArray; class StructBuilder; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 0cea58483f4..1dd8d9f7b97 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -237,6 +237,14 @@ struct TypeTraits { constexpr static bool is_parameter_free = false; }; +template <> +struct TypeTraits { + using ArrayType = LargeListArray; + using BuilderType = LargeListBuilder; + using ScalarType = ListScalar; + constexpr static bool is_parameter_free = false; +}; + template struct CTypeTraits> : public TypeTraits { using ArrowType = ListType; diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index 76e52873bca..14f14ce595c 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -54,6 +54,7 @@ ARRAY_VISITOR_DEFAULT(Time64Array) ARRAY_VISITOR_DEFAULT(TimestampArray) ARRAY_VISITOR_DEFAULT(IntervalArray) ARRAY_VISITOR_DEFAULT(ListArray) +ARRAY_VISITOR_DEFAULT(LargeListArray) ARRAY_VISITOR_DEFAULT(StructArray) ARRAY_VISITOR_DEFAULT(UnionArray) ARRAY_VISITOR_DEFAULT(DictionaryArray) @@ -94,6 +95,7 @@ TYPE_VISITOR_DEFAULT(TimestampType) TYPE_VISITOR_DEFAULT(IntervalType) TYPE_VISITOR_DEFAULT(Decimal128Type) TYPE_VISITOR_DEFAULT(ListType) +TYPE_VISITOR_DEFAULT(LargeListType) TYPE_VISITOR_DEFAULT(StructType) TYPE_VISITOR_DEFAULT(UnionType) TYPE_VISITOR_DEFAULT(DictionaryType) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index d1e3e3a936e..68584370986 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -52,6 +52,7 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const IntervalArray& array); virtual Status Visit(const Decimal128Array& array); virtual Status Visit(const ListArray& array); + virtual Status Visit(const LargeListArray& array); virtual Status Visit(const StructArray& array); virtual Status Visit(const UnionArray& array); virtual Status Visit(const DictionaryArray& array); @@ -86,6 +87,7 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const IntervalType& type); virtual Status Visit(const Decimal128Type& type); virtual Status Visit(const ListType& type); + virtual Status Visit(const LargeListType& type); virtual Status Visit(const StructType& type); virtual Status Visit(const UnionType& type); virtual Status Visit(const DictionaryType& type); diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index e8b8c49e1b7..f9918e680a3 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -55,6 +55,7 @@ namespace arrow { ACTION(Time64); \ ACTION(Decimal128); \ ACTION(List); \ + ACTION(LargeList); \ ACTION(Struct); \ ACTION(Union); \ ACTION(Dictionary); \ diff --git a/cpp/src/gandiva/expression_registry.cc b/cpp/src/gandiva/expression_registry.cc index 8e667f8ad8a..4b0e94c5ddc 100644 --- a/cpp/src/gandiva/expression_registry.cc +++ b/cpp/src/gandiva/expression_registry.cc @@ -143,6 +143,7 @@ void ExpressionRegistry::AddArrowTypesToVector(arrow::Type::type& type, case arrow::Type::type::MAP: case arrow::Type::type::INTERVAL: case arrow::Type::type::LIST: + case arrow::Type::type::LARGE_LIST: case arrow::Type::type::STRUCT: case arrow::Type::type::UNION: case arrow::Type::type::DICTIONARY: diff --git a/cpp/src/gandiva/jni/expression_registry_helper.cc b/cpp/src/gandiva/jni/expression_registry_helper.cc index 94c48336cb7..87e59cb77e1 100644 --- a/cpp/src/gandiva/jni/expression_registry_helper.cc +++ b/cpp/src/gandiva/jni/expression_registry_helper.cc @@ -131,6 +131,7 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) case arrow::Type::type::MAP: case arrow::Type::type::INTERVAL: case arrow::Type::type::LIST: + case arrow::Type::type::LARGE_LIST: case arrow::Type::type::STRUCT: case arrow::Type::type::UNION: case arrow::Type::type::DICTIONARY: diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index fccf49be3a3..d70b1ae9f0c 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -51,6 +51,7 @@ using arrow::Field; using arrow::FixedSizeBinaryArray; using arrow::Int16Array; using arrow::Int16Builder; +using arrow::LargeListArray; using arrow::ListArray; using arrow::MemoryPool; using arrow::NumericArray; @@ -116,6 +117,7 @@ class LevelBuilder { " not supported yet"); \ } + NOT_IMPLEMENTED_VISIT(LargeList) NOT_IMPLEMENTED_VISIT(Struct) NOT_IMPLEMENTED_VISIT(Union) NOT_IMPLEMENTED_VISIT(Dictionary) diff --git a/format/Schema.fbs b/format/Schema.fbs index 9e52a8d5f9e..9141c8504ec 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -44,9 +44,15 @@ table Null { table Struct_ { } +/// List with 32-bit offsets table List { } +// List with 64-bit offsets. This type is optional and currently +// only supported by the C++ implementation. +table LargeList { +} + table FixedSizeList { /// Number of list items per value listSize: int; @@ -208,7 +214,8 @@ union Type { Union, FixedSizeBinary, FixedSizeList, - Map + Map, + LargeList } /// ----------------------------------------------------------------------