diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc index c3118e9847a..9adaf04c54b 100644 --- a/cpp/src/arrow/array-list-test.cc +++ b/cpp/src/arrow/array-list-test.cc @@ -37,308 +37,345 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; +using ListTypes = ::testing::Types; + // ---------------------------------------------------------------------- // List tests +template class TestListArray : public TestBuilder { public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using BuilderType = typename TypeTraits::BuilderType; + using OffsetType = typename TypeTraits::OffsetType; + using OffsetArrayType = typename TypeTraits::OffsetArrayType; + using OffsetBuilderType = typename TypeTraits::OffsetBuilderType; + void SetUp() { TestBuilder::SetUp(); - value_type_ = int32(); - type_ = list(value_type_); + value_type_ = int16(); + type_ = std::make_shared(value_type_); std::unique_ptr tmp; ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); - builder_.reset(checked_cast(tmp.release())); + builder_.reset(checked_cast(tmp.release())); } void Done() { std::shared_ptr out; FinishAndCheckPadding(builder_.get(), &out); - result_ = std::dynamic_pointer_cast(out); + result_ = std::dynamic_pointer_cast(out); } - protected: - std::shared_ptr value_type_; + void ValidateBasicListArray(const ArrayType* result, const std::vector& values, + const std::vector& is_valid) { + ASSERT_OK(ValidateArray(*result)); + ASSERT_EQ(1, result->null_count()); + ASSERT_EQ(0, result->values()->null_count()); - std::shared_ptr builder_; - std::shared_ptr result_; -}; + ASSERT_EQ(3, result->length()); + std::vector ex_offsets = {0, 3, 3, 7}; + for (size_t i = 0; i < ex_offsets.size(); ++i) { + ASSERT_EQ(ex_offsets[i], result->value_offset(i)); + } -TEST_F(TestListArray, Equality) { - Int32Builder* vb = checked_cast(builder_->value_builder()); + for (int i = 0; i < result->length(); ++i) { + ASSERT_EQ(is_valid[i] == 0, result->IsNull(i)); + } - std::shared_ptr array, equal_array, unequal_array; - std::vector equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; - std::vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6}; - std::vector unequal_offsets = {0, 1, 4, 7}; - std::vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; + ASSERT_EQ(7, result->values()->length()); + auto varr = std::dynamic_pointer_cast(result->values()); - // setup two equal arrays - ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); - ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i], varr->Value(i)); + } + } - ASSERT_OK(builder_->Finish(&array)); - ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); - ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); + void TestBasics() { + std::vector values = {0, 1, 2, 3, 4, 5, 6}; + std::vector lengths = {3, 0, 4}; + std::vector is_valid = {1, 0, 1}; - ASSERT_OK(builder_->Finish(&equal_array)); - // now an unequal one - ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_offsets.size())); - ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size())); + Int16Builder* vb = checked_cast(builder_->value_builder()); - ASSERT_OK(builder_->Finish(&unequal_array)); + ASSERT_OK(builder_->Reserve(lengths.size())); + ASSERT_OK(vb->Reserve(values.size())); - // Test array equality - EXPECT_TRUE(array->Equals(array)); - EXPECT_TRUE(array->Equals(equal_array)); - EXPECT_TRUE(equal_array->Equals(array)); - EXPECT_FALSE(equal_array->Equals(unequal_array)); - EXPECT_FALSE(unequal_array->Equals(equal_array)); + int pos = 0; + for (size_t i = 0; i < lengths.size(); ++i) { + ASSERT_OK(builder_->Append(is_valid[i] > 0)); + for (int j = 0; j < lengths[i]; ++j) { + ASSERT_OK(vb->Append(values[pos++])); + } + } - // Test range equality - EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array)); - EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array)); - EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); - EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array)); + Done(); + ValidateBasicListArray(result_.get(), values, is_valid); + } - // Check with slices, ARROW-33 - std::shared_ptr slice, slice2; + void TestEquality() { + auto vb = checked_cast(builder_->value_builder()); - slice = array->Slice(2); - slice2 = array->Slice(2); - ASSERT_EQ(array->length() - 2, slice->length()); + std::shared_ptr array, equal_array, unequal_array; + std::vector equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; + std::vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6}; + std::vector unequal_offsets = {0, 1, 4, 7}; + std::vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; - ASSERT_TRUE(slice->Equals(slice2)); - ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice)); + // setup two equal arrays + ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); - // Chained slices - slice2 = array->Slice(1)->Slice(1); - ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_OK(builder_->Finish(&array)); + ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); - slice = array->Slice(1, 4); - slice2 = array->Slice(1, 4); - ASSERT_EQ(4, slice->length()); + ASSERT_OK(builder_->Finish(&equal_array)); + // now an unequal one + ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_offsets.size())); + ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size())); - ASSERT_TRUE(slice->Equals(slice2)); - ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice)); -} + ASSERT_OK(builder_->Finish(&unequal_array)); -TEST_F(TestListArray, ValuesEquality) { - auto type = list(int32()); - auto left = ArrayFromJSON(type, "[[1, 2], [3], [0]]"); - auto right = ArrayFromJSON(type, "[[1, 2], [3], [100000]]"); - auto offset = 2; - EXPECT_FALSE(left->Slice(offset)->Equals(right->Slice(offset))); -} + // Test array equality + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_array)); + EXPECT_FALSE(unequal_array->Equals(equal_array)); -TEST_F(TestListArray, TestResize) {} + // Test range equality + EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); + EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array)); -TEST_F(TestListArray, TestFromArrays) { - std::shared_ptr offsets1, offsets2, offsets3, offsets4, values; + // Check with slices, ARROW-33 + std::shared_ptr slice, slice2; - std::vector offsets_is_valid3 = {true, false, true, true}; - std::vector offsets_is_valid4 = {true, true, false, true}; + slice = array->Slice(2); + slice2 = array->Slice(2); + ASSERT_EQ(array->length() - 2, slice->length()); - std::vector values_is_valid = {true, false, true, true, true, true}; + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice)); - std::vector offset1_values = {0, 2, 2, 6}; - std::vector offset2_values = {0, 2, 6, 6}; + // Chained slices + slice2 = array->Slice(1)->Slice(1); + ASSERT_TRUE(slice->Equals(slice2)); - std::vector values_values = {0, 1, 2, 3, 4, 5}; - const int length = 3; + slice = array->Slice(1, 4); + slice2 = array->Slice(1, 4); + ASSERT_EQ(4, slice->length()); - ArrayFromVector(offset1_values, &offsets1); - ArrayFromVector(offset2_values, &offsets2); + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice)); + } - ArrayFromVector(offsets_is_valid3, offset1_values, &offsets3); - ArrayFromVector(offsets_is_valid4, offset2_values, &offsets4); + void TestValuesEquality() { + auto type = std::make_shared(int32()); + auto left = ArrayFromJSON(type, "[[1, 2], [3], [0]]"); + auto right = ArrayFromJSON(type, "[[1, 2], [3], [100000]]"); + auto offset = 2; + EXPECT_FALSE(left->Slice(offset)->Equals(right->Slice(offset))); + } - ArrayFromVector(values_is_valid, values_values, &values); + void TestFromArrays() { + std::shared_ptr offsets1, offsets2, offsets3, offsets4, values; - auto list_type = list(int8()); + std::vector offsets_is_valid3 = {true, false, true, true}; + std::vector offsets_is_valid4 = {true, true, false, true}; - std::shared_ptr list1, list3, list4; - ASSERT_OK(ListArray::FromArrays(*offsets1, *values, pool_, &list1)); - ASSERT_OK(ListArray::FromArrays(*offsets3, *values, pool_, &list3)); - ASSERT_OK(ListArray::FromArrays(*offsets4, *values, pool_, &list4)); + std::vector values_is_valid = {true, false, true, true, true, true}; - ListArray expected1(list_type, length, offsets1->data()->buffers[1], values, - offsets1->data()->buffers[0], 0); - AssertArraysEqual(expected1, *list1); + std::vector offset1_values = {0, 2, 2, 6}; + std::vector offset2_values = {0, 2, 6, 6}; - // Use null bitmap from offsets3, but clean offsets from non-null version - ListArray expected3(list_type, length, offsets1->data()->buffers[1], values, - offsets3->data()->buffers[0], 1); - AssertArraysEqual(expected3, *list3); + std::vector values_values = {0, 1, 2, 3, 4, 5}; + const int length = 3; - // Check that the last offset bit is zero - ASSERT_FALSE(BitUtil::GetBit(list3->null_bitmap()->data(), length + 1)); + ArrayFromVector(offset1_values, &offsets1); + ArrayFromVector(offset2_values, &offsets2); - ListArray expected4(list_type, length, offsets2->data()->buffers[1], values, - offsets4->data()->buffers[0], 1); - AssertArraysEqual(expected4, *list4); + ArrayFromVector(offsets_is_valid3, offset1_values, + &offsets3); + ArrayFromVector(offsets_is_valid4, offset2_values, + &offsets4); - // Test failure modes + ArrayFromVector(values_is_valid, values_values, &values); - std::shared_ptr tmp; + auto list_type = std::make_shared(int8()); - // Zero-length offsets - ASSERT_RAISES(Invalid, - ListArray::FromArrays(*offsets1->Slice(0, 0), *values, pool_, &tmp)); + std::shared_ptr list1, list3, list4; + ASSERT_OK(ArrayType::FromArrays(*offsets1, *values, pool_, &list1)); + ASSERT_OK(ArrayType::FromArrays(*offsets3, *values, pool_, &list3)); + ASSERT_OK(ArrayType::FromArrays(*offsets4, *values, pool_, &list4)); + ASSERT_OK(ValidateArray(*list1)); + ASSERT_OK(ValidateArray(*list3)); + ASSERT_OK(ValidateArray(*list4)); - // Offsets not int32 - ASSERT_RAISES(TypeError, ListArray::FromArrays(*values, *offsets1, pool_, &tmp)); -} + ArrayType expected1(list_type, length, offsets1->data()->buffers[1], values, + offsets1->data()->buffers[0], 0); + AssertArraysEqual(expected1, *list1); -TEST_F(TestListArray, TestAppendNull) { - ASSERT_OK(builder_->AppendNull()); - ASSERT_OK(builder_->AppendNull()); + // Use null bitmap from offsets3, but clean offsets from non-null version + ArrayType expected3(list_type, length, offsets1->data()->buffers[1], values, + offsets3->data()->buffers[0], 1); + AssertArraysEqual(expected3, *list3); - Done(); + // Check that the last offset bit is zero + ASSERT_FALSE(BitUtil::GetBit(list3->null_bitmap()->data(), length + 1)); - ASSERT_OK(ValidateArray(*result_)); - ASSERT_TRUE(result_->IsNull(0)); - ASSERT_TRUE(result_->IsNull(1)); + ArrayType expected4(list_type, length, offsets2->data()->buffers[1], values, + offsets4->data()->buffers[0], 1); + AssertArraysEqual(expected4, *list4); - ASSERT_EQ(0, result_->raw_value_offsets()[0]); - ASSERT_EQ(0, result_->value_offset(1)); - ASSERT_EQ(0, result_->value_offset(2)); + // Test failure modes - auto values = result_->values(); - ASSERT_EQ(0, values->length()); - // Values buffer should be non-null - ASSERT_NE(nullptr, values->data()->buffers[1]); -} + std::shared_ptr tmp; -TEST_F(TestListArray, TestAppendNulls) { - ASSERT_OK(builder_->AppendNulls(3)); + // Zero-length offsets + ASSERT_RAISES(Invalid, + ArrayType::FromArrays(*offsets1->Slice(0, 0), *values, pool_, &tmp)); - Done(); + // Offsets not the right type + ASSERT_RAISES(TypeError, ArrayType::FromArrays(*values, *offsets1, pool_, &tmp)); + } - ASSERT_OK(ValidateArray(*result_)); - ASSERT_EQ(result_->length(), 3); - ASSERT_EQ(result_->null_count(), 3); - ASSERT_TRUE(result_->IsNull(0)); - ASSERT_TRUE(result_->IsNull(1)); - ASSERT_TRUE(result_->IsNull(2)); + void TestAppendNull() { + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->AppendNull()); - ASSERT_EQ(0, result_->raw_value_offsets()[0]); - ASSERT_EQ(0, result_->value_offset(1)); - ASSERT_EQ(0, result_->value_offset(2)); - ASSERT_EQ(0, result_->value_offset(3)); + Done(); - auto values = result_->values(); - ASSERT_EQ(0, values->length()); - // Values buffer should be non-null - ASSERT_NE(nullptr, values->data()->buffers[1]); -} + ASSERT_OK(ValidateArray(*result_)); + ASSERT_TRUE(result_->IsNull(0)); + ASSERT_TRUE(result_->IsNull(1)); -void ValidateBasicListArray(const ListArray* result, const std::vector& values, - const std::vector& is_valid) { - ASSERT_OK(ValidateArray(*result)); - ASSERT_EQ(1, result->null_count()); - ASSERT_EQ(0, result->values()->null_count()); + ASSERT_EQ(0, result_->raw_value_offsets()[0]); + ASSERT_EQ(0, result_->value_offset(1)); + ASSERT_EQ(0, result_->value_offset(2)); - ASSERT_EQ(3, result->length()); - std::vector ex_offsets = {0, 3, 3, 7}; - for (size_t i = 0; i < ex_offsets.size(); ++i) { - ASSERT_EQ(ex_offsets[i], result->value_offset(i)); + auto values = result_->values(); + ASSERT_EQ(0, values->length()); + // Values buffer should be non-null + ASSERT_NE(nullptr, values->data()->buffers[1]); } - for (int i = 0; i < result->length(); ++i) { - ASSERT_EQ(is_valid[i] == 0, result->IsNull(i)); - } + void TestAppendNulls() { + ASSERT_OK(builder_->AppendNulls(3)); - ASSERT_EQ(7, result->values()->length()); - auto varr = std::dynamic_pointer_cast(result->values()); + Done(); - for (size_t i = 0; i < values.size(); ++i) { - ASSERT_EQ(values[i], varr->Value(i)); - } -} + ASSERT_OK(ValidateArray(*result_)); + ASSERT_EQ(result_->length(), 3); + ASSERT_EQ(result_->null_count(), 3); + ASSERT_TRUE(result_->IsNull(0)); + ASSERT_TRUE(result_->IsNull(1)); + ASSERT_TRUE(result_->IsNull(2)); -TEST_F(TestListArray, TestBasics) { - std::vector values = {0, 1, 2, 3, 4, 5, 6}; - std::vector lengths = {3, 0, 4}; - std::vector is_valid = {1, 0, 1}; + ASSERT_EQ(0, result_->raw_value_offsets()[0]); + ASSERT_EQ(0, result_->value_offset(1)); + ASSERT_EQ(0, result_->value_offset(2)); + ASSERT_EQ(0, result_->value_offset(3)); - Int32Builder* vb = checked_cast(builder_->value_builder()); + auto values = result_->values(); + ASSERT_EQ(0, values->length()); + // Values buffer should be non-null + ASSERT_NE(nullptr, values->data()->buffers[1]); + } - ASSERT_OK(builder_->Reserve(lengths.size())); - ASSERT_OK(vb->Reserve(values.size())); + void TestBulkAppend() { + std::vector values = {0, 1, 2, 3, 4, 5, 6}; + std::vector is_valid = {1, 0, 1}; + std::vector offsets = {0, 3, 3}; - int pos = 0; - for (size_t i = 0; i < lengths.size(); ++i) { - ASSERT_OK(builder_->Append(is_valid[i] > 0)); - for (int j = 0; j < lengths[i]; ++j) { - ASSERT_OK(vb->Append(values[pos++])); + Int16Builder* vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Reserve(values.size())); + + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + for (int16_t value : values) { + ASSERT_OK(vb->Append(value)); } + Done(); + ValidateBasicListArray(result_.get(), values, is_valid); } - Done(); - ValidateBasicListArray(result_.get(), values, is_valid); -} + void TestBulkAppendInvalid() { + std::vector values = {0, 1, 2, 3, 4, 5, 6}; + std::vector lengths = {3, 0, 4}; + std::vector is_valid = {1, 0, 1}; + // Should be {0, 3, 3} given the is_valid array + std::vector offsets = {0, 2, 4}; -TEST_F(TestListArray, BulkAppend) { - std::vector values = {0, 1, 2, 3, 4, 5, 6}; - std::vector lengths = {3, 0, 4}; - std::vector is_valid = {1, 0, 1}; - std::vector offsets = {0, 3, 3}; + Int16Builder* vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Reserve(values.size())); - Int32Builder* vb = checked_cast(builder_->value_builder()); - ASSERT_OK(vb->Reserve(values.size())); + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + for (int16_t value : values) { + ASSERT_OK(vb->Append(value)); + } - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); - for (int32_t value : values) { - ASSERT_OK(vb->Append(value)); + Done(); + ASSERT_RAISES(Invalid, ValidateArray(*result_)); } - Done(); - ValidateBasicListArray(result_.get(), values, is_valid); -} -TEST_F(TestListArray, BulkAppendInvalid) { - std::vector values = {0, 1, 2, 3, 4, 5, 6}; - std::vector lengths = {3, 0, 4}; - std::vector is_null = {0, 1, 0}; - std::vector is_valid = {1, 0, 1}; - std::vector offsets = {0, 2, 4}; // should be 0, 3, 3 given the is_null array + void TestZeroLength() { + // All buffers are null + Done(); + ASSERT_OK(ValidateArray(*result_)); + } - Int32Builder* vb = checked_cast(builder_->value_builder()); - ASSERT_OK(vb->Reserve(values.size())); + void TestBuilderPreserveFieldName() { + auto list_type_with_name = std::make_shared(field("counts", int16())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); - for (int32_t value : values) { - ASSERT_OK(vb->Append(value)); + std::unique_ptr tmp; + ASSERT_OK(MakeBuilder(pool_, list_type_with_name, &tmp)); + builder_.reset(checked_cast(tmp.release())); + + std::vector offsets = {1, 2, 4, 8}; + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size())); + + std::shared_ptr list_array; + ASSERT_OK(builder_->Finish(&list_array)); + + const auto& type = checked_cast(*list_array->type()); + ASSERT_EQ("counts", type.value_field()->name()); } - Done(); - ASSERT_RAISES(Invalid, ValidateArray(*result_)); -} + protected: + std::shared_ptr value_type_; -TEST_F(TestListArray, TestZeroLength) { - // All buffers are null - Done(); - ASSERT_OK(ValidateArray(*result_)); -} + std::shared_ptr builder_; + std::shared_ptr result_; +}; -TEST_F(TestListArray, TestBuilderPreserveFieleName) { - auto list_type_with_name = list(field("counts", int32())); +TYPED_TEST_CASE(TestListArray, ListTypes); - std::unique_ptr tmp; - ASSERT_OK(MakeBuilder(pool_, list_type_with_name, &tmp)); - builder_.reset(checked_cast(tmp.release())); +TYPED_TEST(TestListArray, Basics) { this->TestBasics(); } - std::vector values = {1, 2, 4, 8}; - ASSERT_OK(builder_->AppendValues(values.data(), values.size())); +TYPED_TEST(TestListArray, Equality) { this->TestEquality(); } - std::shared_ptr list_array; - ASSERT_OK(builder_->Finish(&list_array)); +TYPED_TEST(TestListArray, ValuesEquality) { this->TestValuesEquality(); } - const auto& type = checked_cast(*list_array->type()); - ASSERT_EQ("counts", type.value_field()->name()); +TYPED_TEST(TestListArray, FromArrays) { this->TestFromArrays(); } + +TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); } + +TYPED_TEST(TestListArray, AppendNulls) { this->TestAppendNulls(); } + +TYPED_TEST(TestListArray, BulkAppend) { this->TestBulkAppend(); } + +TYPED_TEST(TestListArray, BulkAppendInvalid) { this->TestBulkAppendInvalid(); } + +TYPED_TEST(TestListArray, ZeroLength) { this->TestZeroLength(); } + +TYPED_TEST(TestListArray, BuilderPreserveFieldName) { + this->TestBuilderPreserveFieldName(); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 0b7d8f170cb..01f0ddb1668 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "arrow/buffer.h" @@ -199,34 +200,29 @@ BooleanArray::BooleanArray(int64_t length, const std::shared_ptr& data, : PrimitiveArray(boolean(), length, data, null_bitmap, null_count, offset) {} // ---------------------------------------------------------------------- -// ListArray +// ListArray / LargeListArray -ListArray::ListArray(const std::shared_ptr& data) { SetData(data); } +namespace { -ListArray::ListArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& value_offsets, - const std::shared_ptr& values, - const std::shared_ptr& null_bitmap, int64_t null_count, - int64_t offset) { - auto internal_data = - ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); - internal_data->child_data.emplace_back(values->data()); - SetData(internal_data); -} +template +Status ListArrayFromArrays(const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr* out) { + using offset_type = typename TYPE::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + using OffsetArrayType = typename TypeTraits::ArrayType; -Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, - std::shared_ptr* out) { if (offsets.length() == 0) { return Status::Invalid("List offsets must have non-zero length"); } - if (offsets.type_id() != Type::INT32) { - return Status::TypeError("List offsets must be signed int32"); + if (offsets.type_id() != OffsetArrowType::type_id) { + return Status::TypeError("List offsets must be ", OffsetArrowType::type_name()); } BufferVector buffers = {}; - const auto& typed_offsets = checked_cast(offsets); + const auto& typed_offsets = checked_cast(offsets); const int64_t num_offsets = offsets.length(); @@ -236,7 +232,8 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo } std::shared_ptr clean_offsets, clean_valid_bits; - RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t), &clean_offsets)); + RETURN_NOT_OK( + AllocateBuffer(pool, num_offsets * sizeof(offset_type), &clean_offsets)); // Copy valid bits, zero out the bit for the final offset // XXX why? @@ -245,11 +242,12 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets); buffers.emplace_back(std::move(clean_valid_bits)); - const int32_t* raw_offsets = typed_offsets.raw_values(); - auto clean_raw_offsets = reinterpret_cast(clean_offsets->mutable_data()); + const offset_type* raw_offsets = typed_offsets.raw_values(); + auto clean_raw_offsets = + reinterpret_cast(clean_offsets->mutable_data()); // Must work backwards so we can tell how many values were in the last non-null value - int32_t current_offset = raw_offsets[num_offsets - 1]; + offset_type current_offset = raw_offsets[num_offsets - 1]; for (int64_t i = num_offsets - 1; i >= 0; --i) { if (offsets.IsValid(i)) { current_offset = raw_offsets[i]; @@ -263,25 +261,55 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo buffers.emplace_back(typed_offsets.values()); } - auto list_type = list(values.type()); + auto list_type = std::make_shared(values.type()); auto internal_data = ArrayData::Make(list_type, num_offsets - 1, std::move(buffers), offsets.null_count(), offsets.offset()); internal_data->child_data.push_back(values.data()); - *out = std::make_shared(internal_data); + *out = std::make_shared(internal_data); return Status::OK(); } +} // namespace + +ListArray::ListArray(const std::shared_ptr& data) { SetData(data); } + +LargeListArray::LargeListArray(const std::shared_ptr& data) { SetData(data); } + +ListArray::ListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap, int64_t null_count, + int64_t offset) { + ARROW_CHECK_EQ(type->id(), Type::LIST); + auto internal_data = + ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); + internal_data->child_data.emplace_back(values->data()); + SetData(internal_data); +} + +LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap, + int64_t null_count, int64_t offset) { + ARROW_CHECK_EQ(type->id(), Type::LARGE_LIST); + auto internal_data = + ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); + internal_data->child_data.emplace_back(values->data()); + SetData(internal_data); +} + void ListArray::SetData(const std::shared_ptr& data) { this->Array::SetData(data); ARROW_CHECK_EQ(data->buffers.size(), 2); - ARROW_CHECK(data->type->id() == Type::LIST); + ARROW_CHECK_EQ(data->type->id(), Type::LIST); list_type_ = checked_cast(data->type.get()); auto value_offsets = data->buffers[1]; raw_value_offsets_ = value_offsets == nullptr ? nullptr - : reinterpret_cast(value_offsets->data()); + : reinterpret_cast(value_offsets->data()); ARROW_CHECK_EQ(data_->child_data.size(), 1); ARROW_CHECK_EQ(list_type_->value_type()->id(), data->child_data[0]->type->id()); @@ -289,11 +317,32 @@ void ListArray::SetData(const std::shared_ptr& data) { values_ = MakeArray(data_->child_data[0]); } -std::shared_ptr ListArray::value_type() const { - return list_type()->value_type(); +void LargeListArray::SetData(const std::shared_ptr& data) { + this->Array::SetData(data); + ARROW_CHECK_EQ(data->buffers.size(), 2); + ARROW_CHECK_EQ(data->type->id(), Type::LARGE_LIST); + list_type_ = checked_cast(data->type.get()); + + auto value_offsets = data->buffers[1]; + raw_value_offsets_ = value_offsets == nullptr + ? nullptr + : reinterpret_cast(value_offsets->data()); + + ARROW_CHECK_EQ(data_->child_data.size(), 1); + ARROW_CHECK_EQ(list_type_->value_type()->id(), data->child_data[0]->type->id()); + DCHECK(list_type_->value_type()->Equals(data->child_data[0]->type)); + values_ = MakeArray(data_->child_data[0]); } -std::shared_ptr ListArray::values() const { return values_; } +Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr* out) { + return ListArrayFromArrays(offsets, values, pool, out); +} + +Status LargeListArray::FromArrays(const Array& offsets, const Array& values, + MemoryPool* pool, std::shared_ptr* out) { + return ListArrayFromArrays(offsets, values, pool, out); +} // ---------------------------------------------------------------------- // MapArray @@ -1167,21 +1216,12 @@ struct ValidateVisitor { } Status Visit(const ListArray& array) { - if (!array.values()) { - return Status::Invalid("values was null"); - } - - const int32_t last_offset = array.value_offset(array.length()); - if (array.values()->length() != last_offset) { - return Status::Invalid("Final offset invariant not equal to values length: ", - last_offset, "!=", array.values()->length()); - } - - const Status child_valid = ValidateArray(*array.values()); - if (!child_valid.ok()) { - return Status::Invalid("Child array invalid: ", child_valid.ToString()); - } + RETURN_NOT_OK(ValidateListArray(array)); + return ValidateOffsets(array); + } + Status Visit(const LargeListArray& array) { + RETURN_NOT_OK(ValidateListArray(array)); return ValidateOffsets(array); } @@ -1280,6 +1320,26 @@ struct ValidateVisitor { } protected: + template + Status ValidateListArray(const ListArrayType& array) { + if (!array.values()) { + return Status::Invalid("values was null"); + } + + const auto last_offset = array.value_offset(array.length()); + if (array.values()->length() != last_offset) { + return Status::Invalid("Final offset invariant not equal to values length: ", + last_offset, "!=", array.values()->length()); + } + + const Status child_valid = ValidateArray(*array.values()); + if (!child_valid.ok()) { + return Status::Invalid("Child array invalid: ", child_valid.ToString()); + } + + return ValidateOffsets(array); + } + template Status ValidateOffsets(ArrayType& array) { using offset_type = typename ArrayType::offset_type; diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index e13088c65c7..2313994ad20 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -488,12 +488,49 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { // ---------------------------------------------------------------------- // ListArray -/// Concrete Array class for list data -class ARROW_EXPORT ListArray : public Array { +/// Base class for variable-sized list arrays, regardless of offset size. +template +class BaseListArray : public Array { public: - using TypeClass = ListType; - using offset_type = ListType::offset_type; + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; + + const TypeClass* list_type() const { return list_type_; } + + /// \brief Return array object containing the list's values + std::shared_ptr values() const { return values_; } + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_offsets() const { return data_->buffers[1]; } + std::shared_ptr value_type() const { return list_type_->value_type(); } + + /// Return pointer to raw value offsets accounting for any slice offset + const offset_type* raw_value_offsets() const { + return raw_value_offsets_ + data_->offset; + } + + // The following functions will not perform boundschecking + offset_type value_offset(int64_t i) const { + return raw_value_offsets_[i + data_->offset]; + } + offset_type value_length(int64_t i) const { + i += data_->offset; + return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; + } + std::shared_ptr value_slice(int64_t i) const { + return values_->Slice(value_offset(i), value_length(i)); + } + + protected: + const TypeClass* list_type_ = NULLPTR; + std::shared_ptr values_; + const offset_type* raw_value_offsets_ = NULLPTR; +}; + +/// Concrete Array class for list data +class ARROW_EXPORT ListArray : public BaseListArray { + public: explicit ListArray(const std::shared_ptr& data); ListArray(const std::shared_ptr& type, int64_t length, @@ -511,46 +548,48 @@ class ARROW_EXPORT ListArray : public Array { /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int32 type - /// \param[in] values Array containing + /// \param[in] values Array containing list values /// \param[in] pool MemoryPool in case new offsets array needs to be /// allocated because of null values /// \param[out] out Will have length equal to offsets.length() - 1 static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr* out); - const ListType* list_type() const { return list_type_; } - - /// \brief Return array object containing the list's values - std::shared_ptr values() const; - - /// Note that this buffer does not account for any slice offset - std::shared_ptr value_offsets() const { return data_->buffers[1]; } - - std::shared_ptr value_type() const; - - /// Return pointer to raw value offsets accounting for any slice offset - const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - - // The following functions will not perform boundschecking - int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - int32_t value_length(int64_t i) const { - i += data_->offset; - return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; - } - std::shared_ptr value_slice(int64_t i) const { - return values_->Slice(value_offset(i), value_length(i)); - } - protected: // This constructor defers SetData to a derived array class ListArray() = default; void SetData(const std::shared_ptr& data); +}; - const int32_t* raw_value_offsets_ = NULLPTR; +/// Concrete Array class for large list data (with 64-bit offsets) +class ARROW_EXPORT LargeListArray : public BaseListArray { + public: + explicit LargeListArray(const std::shared_ptr& data); - private: - const ListType* list_type_ = NULLPTR; - std::shared_ptr values_; + LargeListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct LargeListArray from array of offsets and child value array + /// + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed + /// + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. Must be of int64 type + /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool in case new offsets array needs to be + /// allocated because of null values + /// \param[out] out Will have length equal to offsets.length() - 1 + static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr* out); + + protected: + void SetData(const std::shared_ptr& data); }; // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index 30b3fc05e59..809cf962a0d 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -24,7 +24,6 @@ #include #include -#include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/status.h" #include "arrow/type.h" @@ -35,115 +34,6 @@ namespace arrow { -// ---------------------------------------------------------------------- -// ListBuilder - -ListBuilder::ListBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, - const std::shared_ptr& type) - : ArrayBuilder(type ? type - : std::static_pointer_cast( - std::make_shared(value_builder->type())), - pool), - offsets_builder_(pool), - value_builder_(value_builder) {} - -Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length, - const uint8_t* valid_bytes) { - RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - offsets_builder_.UnsafeAppend(offsets, length); - return Status::OK(); -} - -Status ListBuilder::CheckNextOffset() const { - const int64_t num_values = value_builder_->length(); - ARROW_RETURN_IF( - num_values > kListMaximumElements, - Status::CapacityError("ListArray cannot contain more than 2^31 - 1 child elements,", - " have ", num_values)); - return Status::OK(); -} - -Status ListBuilder::AppendNextOffset() { - RETURN_NOT_OK(CheckNextOffset()); - const int64_t num_values = value_builder_->length(); - return offsets_builder_.Append(static_cast(num_values)); -} - -Status ListBuilder::Append(bool is_valid) { - RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(is_valid); - return AppendNextOffset(); -} - -Status ListBuilder::AppendNulls(int64_t length) { - RETURN_NOT_OK(Reserve(length)); - RETURN_NOT_OK(CheckNextOffset()); - UnsafeAppendToBitmap(length, false); - const int64_t num_values = value_builder_->length(); - for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_values)); - } - return Status::OK(); -} - -Status ListBuilder::Resize(int64_t capacity) { - if (capacity > kListMaximumElements) { - return Status::CapacityError( - "ListArray cannot reserve space for more then 2^31 - 1 child elements, got ", - capacity); - } - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - - // one more then requested for offsets - RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); - return ArrayBuilder::Resize(capacity); -} - -Status ListBuilder::FinishInternal(std::shared_ptr* out) { - RETURN_NOT_OK(AppendNextOffset()); - - // Offset padding zeroed by BufferBuilder - std::shared_ptr offsets; - RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - - std::shared_ptr items; - if (values_) { - items = values_->data(); - } else { - if (value_builder_->length() == 0) { - // Try to make sure we get a non-null values buffer (ARROW-2744) - RETURN_NOT_OK(value_builder_->Resize(0)); - } - RETURN_NOT_OK(value_builder_->FinishInternal(&items)); - } - - // If the type has not been specified in the constructor, infer it - // This is the case if the value_builder contains a DenseUnionBuilder - if (!arrow::internal::checked_cast(*type_).value_type()) { - type_ = std::static_pointer_cast( - std::make_shared(value_builder_->type())); - } - std::shared_ptr null_bitmap; - RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - *out = ArrayData::Make(type_, length_, {null_bitmap, offsets}, null_count_); - (*out)->child_data.emplace_back(std::move(items)); - Reset(); - return Status::OK(); -} - -void ListBuilder::Reset() { - ArrayBuilder::Reset(); - values_.reset(); - offsets_builder_.Reset(); - value_builder_->Reset(); -} - -ArrayBuilder* ListBuilder::value_builder() const { - DCHECK(!values_) << "Using value builder is pointless when values_ is set"; - return value_builder_.get(); -} // ---------------------------------------------------------------------- // MapBuilder diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 8742f2b6e24..9b5b4de162f 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -17,9 +17,12 @@ #pragma once +#include #include +#include #include +#include "arrow/array.h" #include "arrow/array/builder_base.h" #include "arrow/buffer-builder.h" @@ -28,63 +31,174 @@ namespace arrow { // ---------------------------------------------------------------------- // List builder -/// \class ListBuilder -/// \brief Builder class for variable-length list array value types -/// -/// To use this class, you must append values to the child array builder and use -/// the Append function to delimit each distinct list value (once the values -/// have been appended to the child array) or use the bulk API to append -/// a sequence of offests and null values. -/// -/// A note on types. Per arrow/type.h all types in the c++ implementation are -/// logical so even though this class always builds list array, this can -/// represent multiple different logical types. If no logical type is provided -/// at construction time, the class defaults to List where t is taken from the -/// value_builder/values that the object is constructed with. -class ARROW_EXPORT ListBuilder : public ArrayBuilder { +template +class BaseListBuilder : public ArrayBuilder { public: + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; + /// Use this constructor to incrementally build the value array along with offsets and /// null bitmap. - ListBuilder(MemoryPool* pool, const std::shared_ptr& value_builder, - const std::shared_ptr& type = NULLPTR); - - Status Resize(int64_t capacity) override; - void Reset() override; - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond + BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, + const std::shared_ptr& type = NULLPTR) + : ArrayBuilder(type ? type + : std::static_pointer_cast( + std::make_shared(value_builder->type())), + pool), + offsets_builder_(pool), + value_builder_(value_builder) {} + + Status Resize(int64_t capacity) override { + if (capacity > maximum_elements()) { + return Status::CapacityError("List array cannot reserve space for more than ", + maximum_elements(), " got ", capacity); + } + ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + + // one more then requested for offsets + ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); + return ArrayBuilder::Resize(capacity); + } - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + void Reset() override { + ArrayBuilder::Reset(); + values_.reset(); + offsets_builder_.Reset(); + value_builder_->Reset(); + } /// \brief Vector append /// /// If passed, valid_bytes is of equal length to values, and any zero byte /// will be considered as a null for that slot - Status AppendValues(const int32_t* offsets, int64_t length, - const uint8_t* valid_bytes = NULLPTR); + Status AppendValues(const offset_type* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + offsets_builder_.UnsafeAppend(offsets, length); + return Status::OK(); + } /// \brief Start a new variable-length list slot /// /// This function should be called before beginning to append elements to the /// value builder - Status Append(bool is_valid = true); + Status Append(bool is_valid = true) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return AppendNextOffset(); + } Status AppendNull() final { return Append(false); } - Status AppendNulls(int64_t length) final; + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + ARROW_RETURN_NOT_OK(CheckNextOffset()); + UnsafeAppendToBitmap(length, false); + const int64_t num_values = value_builder_->length(); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_values)); + } + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + + // Offset padding zeroed by BufferBuilder + std::shared_ptr offsets; + ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + + std::shared_ptr items; + if (values_) { + items = values_->data(); + } else { + if (value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(value_builder_->Resize(0)); + } + ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items)); + } + + // If the type has not been specified in the constructor, infer it + // This is the case if the value_builder contains a DenseUnionBuilder + if (!arrow::internal::checked_cast(*type_).value_type()) { + type_ = std::static_pointer_cast( + std::make_shared(value_builder_->type())); + } + std::shared_ptr null_bitmap; + ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); + *out = ArrayData::Make(type_, length_, {null_bitmap, offsets}, null_count_); + (*out)->child_data.emplace_back(std::move(items)); + Reset(); + return Status::OK(); + } - ArrayBuilder* value_builder() const; + ArrayBuilder* value_builder() const { return value_builder_.get(); } + + // Cannot make this a static attribute because of linking issues + static constexpr int64_t maximum_elements() { + return std::numeric_limits::max() - 1; + } protected: - TypedBufferBuilder offsets_builder_; + TypedBufferBuilder offsets_builder_; std::shared_ptr value_builder_; std::shared_ptr values_; - Status CheckNextOffset() const; - Status AppendNextOffset(); - Status AppendNextOffset(int64_t num_repeats); + Status CheckNextOffset() const { + const int64_t num_values = value_builder_->length(); + ARROW_RETURN_IF( + num_values > maximum_elements(), + Status::CapacityError("List array cannot contain more than ", maximum_elements(), + " child elements,", " have ", num_values)); + return Status::OK(); + } + + Status AppendNextOffset() { + ARROW_RETURN_NOT_OK(CheckNextOffset()); + const int64_t num_values = value_builder_->length(); + return offsets_builder_.Append(static_cast(num_values)); + } +}; + +/// \class ListBuilder +/// \brief Builder class for variable-length list array value types +/// +/// To use this class, you must append values to the child array builder and use +/// the Append function to delimit each distinct list value (once the values +/// have been appended to the child array) or use the bulk API to append +/// a sequence of offests and null values. +/// +/// A note on types. Per arrow/type.h all types in the c++ implementation are +/// logical so even though this class always builds list array, this can +/// represent multiple different logical types. If no logical type is provided +/// at construction time, the class defaults to List where t is taken from the +/// value_builder/values that the object is constructed with. +class ARROW_EXPORT ListBuilder : public BaseListBuilder { + public: + using BaseListBuilder::BaseListBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +/// \class LargeListBuilder +/// \brief Builder class for large variable-length list array value types +/// +/// Like ListBuilder, but to create large list arrays (with 64-bit offsets). +class ARROW_EXPORT LargeListBuilder : public BaseListBuilder { + public: + using BaseListBuilder::BaseListBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } }; // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array/concatenate-test.cc b/cpp/src/arrow/array/concatenate-test.cc index 730b25ab822..4d16b173358 100644 --- a/cpp/src/arrow/array/concatenate-test.cc +++ b/cpp/src/arrow/array/concatenate-test.cc @@ -165,11 +165,28 @@ TEST_F(ConcatenateTest, ListType) { auto values_size = size * 4; auto values = this->GeneratePrimitive(values_size, null_probability); auto offsets_vector = this->Offsets(values_size, size); - // ensure the first offset is 0, which is expected for ListType - offsets_vector[0] = 0; + // Ensure first and last offsets encompass the whole values array + offsets_vector.front() = 0; + offsets_vector.back() = static_cast(values_size); std::shared_ptr offsets; ArrayFromVector(offsets_vector, &offsets); ASSERT_OK(ListArray::FromArrays(*offsets, *values, default_memory_pool(), out)); + ASSERT_OK(ValidateArray(**out)); + }); +} + +TEST_F(ConcatenateTest, LargeListType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto values_size = size * 4; + auto values = this->GeneratePrimitive(values_size, null_probability); + auto offsets_vector = this->Offsets(values_size, size); + // Ensure first and last offsets encompass the whole values array + offsets_vector.front() = 0; + offsets_vector.back() = static_cast(values_size); + std::shared_ptr offsets; + ArrayFromVector(offsets_vector, &offsets); + ASSERT_OK(LargeListArray::FromArrays(*offsets, *values, default_memory_pool(), out)); + ASSERT_OK(ValidateArray(**out)); }); } diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index a20b157acd5..4428e4beaab 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -204,6 +204,14 @@ class ConcatenateImpl { .Concatenate(out_.child_data[0].get()); } + Status Visit(const LargeListType&) { + std::vector value_ranges; + RETURN_NOT_OK(ConcatenateOffsets(Buffers(1, sizeof(int64_t)), pool_, + &out_.buffers[1], &value_ranges)); + return ConcatenateImpl(ChildData(0, value_ranges), pool_) + .Concatenate(out_.child_data[0].get()); + } + Status Visit(const FixedSizeListType&) { return ConcatenateImpl(ChildData(0), pool_).Concatenate(out_.child_data[0].get()); } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 44b0d041be9..b13ce200f26 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -136,6 +136,14 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, out->reset(new ListBuilder(pool, std::move(value_builder), type)); return Status::OK(); } + case Type::LARGE_LIST: { + std::unique_ptr value_builder; + std::shared_ptr value_type = + internal::checked_cast(*type).value_type(); + RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); + out->reset(new LargeListBuilder(pool, std::move(value_builder), type)); + return Status::OK(); + } case Type::MAP: { const auto& map_type = internal::checked_cast(*type); std::unique_ptr key_builder, item_builder; diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index cb606e30b9b..ff4c2b5a1eb 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -36,6 +36,7 @@ #include "arrow/status.h" #include "arrow/tensor.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -174,8 +175,9 @@ class RangeEqualsVisitor { return true; } - bool CompareLists(const ListArray& left) { - const auto& right = checked_cast(right_); + template + bool CompareLists(const ListArrayType& left) { + const auto& right = checked_cast(right_); const std::shared_ptr& left_values = left.values(); const std::shared_ptr& right_values = right.values(); @@ -187,10 +189,10 @@ class RangeEqualsVisitor { return false; } if (is_null) continue; - const int32_t begin_offset = left.value_offset(i); - const int32_t end_offset = left.value_offset(i + 1); - const int32_t right_begin_offset = right.value_offset(o_i); - const int32_t right_end_offset = right.value_offset(o_i + 1); + const auto begin_offset = left.value_offset(i); + const auto end_offset = left.value_offset(i + 1); + const auto right_begin_offset = right.value_offset(o_i); + const auto right_end_offset = right.value_offset(o_i + 1); // Underlying can't be equal if the size isn't equal if (end_offset - begin_offset != right_end_offset - right_begin_offset) { return false; @@ -339,6 +341,11 @@ class RangeEqualsVisitor { return Status::OK(); } + Status Visit(const LargeListArray& left) { + result_ = CompareLists(left); + return Status::OK(); + } + Status Visit(const FixedSizeListArray& left) { const auto& right = checked_cast(right_); result_ = left.values()->RangeEquals( @@ -569,6 +576,20 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { } } + template + bool CompareList(const ListArrayType& left) { + const auto& right = checked_cast(right_); + + bool equal_offsets = ValueOffsetsEqual(left); + if (!equal_offsets) { + return false; + } + + return left.values()->RangeEquals(left.value_offset(0), + left.value_offset(left.length()), + right.value_offset(0), right.values()); + } + Status Visit(const BinaryArray& left) { result_ = CompareBinary(left); return Status::OK(); @@ -580,16 +601,12 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { } Status Visit(const ListArray& left) { - const auto& right = checked_cast(right_); - bool equal_offsets = ValueOffsetsEqual(left); - if (!equal_offsets) { - result_ = false; - return Status::OK(); - } + result_ = CompareList(left); + return Status::OK(); + } - result_ = - left.values()->RangeEquals(left.value_offset(0), left.value_offset(left.length()), - right.value_offset(0), right.values()); + Status Visit(const LargeListArray& left) { + result_ = CompareList(left); return Status::OK(); } @@ -760,6 +777,8 @@ class TypeEqualsVisitor { Status Visit(const ListType& left) { return VisitChildren(left); } + Status Visit(const LargeListType& left) { return VisitChildren(left); } + Status Visit(const MapType& left) { const auto& right = checked_cast(right_); if (left.keys_sorted() != right.keys_sorted()) { @@ -858,6 +877,12 @@ class ScalarEqualsVisitor { return Status::OK(); } + Status Visit(const LargeListScalar& left) { + const auto& right = checked_cast(right_); + result_ = internal::SharedPtrEquals(left.value, right.value); + return Status::OK(); + } + Status Visit(const MapScalar& left) { const auto& right = checked_cast(right_); result_ = internal::SharedPtrEquals(left.keys, right.keys) && diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc index 80538f20e4f..61f1500309d 100644 --- a/cpp/src/arrow/compute/kernels/cast-test.cc +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -1147,6 +1147,34 @@ TEST_F(TestCast, ListToList) { CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options); } +TEST_F(TestCast, LargeListToLargeList) { + // Like ListToList above, only testing the basics + CastOptions options; + std::shared_ptr offsets; + + std::vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; + std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; + ArrayFromVector(offsets_is_valid, offsets_values, &offsets); + + std::shared_ptr int32_plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + std::shared_ptr int32_list_array; + ASSERT_OK( + LargeListArray::FromArrays(*offsets, *int32_plain_array, pool_, &int32_list_array)); + + std::shared_ptr float64_plain_array; + ASSERT_OK( + Cast(&this->ctx_, *int32_plain_array, float64(), options, &float64_plain_array)); + std::shared_ptr float64_list_array; + ASSERT_OK(LargeListArray::FromArrays(*offsets, *float64_plain_array, pool_, + &float64_list_array)); + + CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), options); + + options.allow_float_truncate = true; + CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), options); +} + TEST_F(TestCast, IdentityCasts) { // ARROW-4102 auto CheckIdentityCast = [this](std::shared_ptr type, diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index a8b66159997..839f9a2e9d5 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -645,6 +645,7 @@ Status InvokeWithAllocation(FunctionContext* ctx, UnaryKernel* func, const Datum return Status::OK(); } +template class ListCastKernel : public CastKernelBase { public: ListCastKernel(std::unique_ptr child_caster, @@ -655,7 +656,7 @@ class ListCastKernel : public CastKernelBase { DCHECK_EQ(Datum::ARRAY, input.kind()); const ArrayData& in_data = *input.array(); - DCHECK_EQ(Type::LIST, in_data.type->id()); + DCHECK_EQ(TypeClass::type_id, in_data.type->id()); ArrayData* result; if (in_data.offset != 0) { @@ -1160,19 +1161,20 @@ GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType) namespace { +template Status GetListCastFunc(const DataType& in_type, std::shared_ptr out_type, const CastOptions& options, std::unique_ptr* kernel) { - if (out_type->id() != Type::LIST) { + if (out_type->id() != TypeClass::type_id) { // Kernel will be null return Status::OK(); } - const DataType& in_value_type = *checked_cast(in_type).value_type(); + const DataType& in_value_type = *checked_cast(in_type).value_type(); std::shared_ptr out_value_type = - checked_cast(*out_type).value_type(); + checked_cast(*out_type).value_type(); std::unique_ptr child_caster; RETURN_NOT_OK(GetCastFunction(in_value_type, out_value_type, options, &child_caster)); *kernel = std::unique_ptr( - new ListCastKernel(std::move(child_caster), std::move(out_type))); + new ListCastKernel(std::move(child_caster), std::move(out_type))); return Status::OK(); } @@ -1238,7 +1240,12 @@ Status GetCastFunction(const DataType& in_type, std::shared_ptr out_ty CAST_FUNCTION_CASE(LargeStringType); CAST_FUNCTION_CASE(DictionaryType); case Type::LIST: - RETURN_NOT_OK(GetListCastFunc(in_type, std::move(out_type), options, kernel)); + RETURN_NOT_OK( + GetListCastFunc(in_type, std::move(out_type), options, kernel)); + break; + case Type::LARGE_LIST: + RETURN_NOT_OK( + GetListCastFunc(in_type, std::move(out_type), options, kernel)); break; default: break; diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc index 253093e3799..45fd9e5d070 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -358,6 +358,15 @@ TEST_F(TestFilterKernelWithList, FilterListListInt32) { ])"); } +class TestFilterKernelWithLargeList : public TestFilterKernel {}; + +TEST_F(TestFilterKernelWithLargeList, FilterListInt32) { + std::string list_json = "[[], [1,2], null, [3]]"; + this->AssertFilter(large_list(int32()), list_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(large_list(int32()), list_json, "[0, 1, 1, null]", + "[[1,2], null, null]"); +} + class TestFilterKernelWithFixedSizeList : public TestFilterKernel {}; TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) { diff --git a/cpp/src/arrow/compute/kernels/take-internal.h b/cpp/src/arrow/compute/kernels/take-internal.h index 96519a9869a..04e89d12d87 100644 --- a/cpp/src/arrow/compute/kernels/take-internal.h +++ b/cpp/src/arrow/compute/kernels/take-internal.h @@ -298,20 +298,23 @@ class TakerImpl : public Taker { int64_t length_ = 0; }; -template -class TakerImpl : public Taker { +template +class ListTakerImpl : public Taker { public: + using offset_type = typename TypeClass::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using Taker::Taker; Status Init() override { - const auto& list_type = checked_cast(*this->type_); + const auto& list_type = checked_cast(*this->type_); return Taker::Make(list_type.value_type(), &value_taker_); } Status SetContext(FunctionContext* ctx) override { auto pool = ctx->memory_pool(); null_bitmap_builder_.reset(new TypedBufferBuilder(pool)); - offset_builder_.reset(new TypedBufferBuilder(pool)); + offset_builder_.reset(new TypedBufferBuilder(pool)); RETURN_NOT_OK(offset_builder_->Append(0)); return value_taker_->SetContext(ctx); } @@ -319,12 +322,12 @@ class TakerImpl : public Taker { Status Take(const Array& values, IndexSequence indices) override { DCHECK(this->type_->Equals(values.type())); - const auto& list_array = checked_cast(values); + const auto& list_array = checked_cast(values); RETURN_NOT_OK(null_bitmap_builder_->Reserve(indices.length())); RETURN_NOT_OK(offset_builder_->Reserve(indices.length())); - int32_t offset = offset_builder_->data()[offset_builder_->length() - 1]; + offset_type offset = offset_builder_->data()[offset_builder_->length() - 1]; return VisitIndices(indices, values, [&](int64_t index, bool is_valid) { null_bitmap_builder_->UnsafeAppend(is_valid); @@ -340,13 +343,7 @@ class TakerImpl : public Taker { }); } - Status Finish(std::shared_ptr* out) override { return FinishAs(out); } - - protected: - // this added method is provided for use by TakerImpl, - // which needs to construct a MapArray rather than a ListArray - template - Status FinishAs(std::shared_ptr* out) { + Status Finish(std::shared_ptr* out) override { auto null_count = null_bitmap_builder_->false_count(); auto length = null_bitmap_builder_->length(); @@ -357,24 +354,30 @@ class TakerImpl : public Taker { std::shared_ptr taken_values; RETURN_NOT_OK(value_taker_->Finish(&taken_values)); - out->reset( - new T(this->type_, length, offsets, taken_values, null_bitmap, null_count)); + out->reset(new ArrayType(this->type_, length, offsets, taken_values, null_bitmap, + null_count)); return Status::OK(); } std::unique_ptr> null_bitmap_builder_; - std::unique_ptr> offset_builder_; + std::unique_ptr> offset_builder_; std::unique_ptr> value_taker_; }; template -class TakerImpl : public TakerImpl { - public: - using TakerImpl::TakerImpl; +class TakerImpl : public ListTakerImpl { + using ListTakerImpl::ListTakerImpl; +}; - Status Finish(std::shared_ptr* out) override { - return this->template FinishAs(out); - } +template +class TakerImpl + : public ListTakerImpl { + using ListTakerImpl::ListTakerImpl; +}; + +template +class TakerImpl : public ListTakerImpl { + using ListTakerImpl::ListTakerImpl; }; template diff --git a/cpp/src/arrow/compute/kernels/take-test.cc b/cpp/src/arrow/compute/kernels/take-test.cc index 7ae9321bb46..6a8e30b6e09 100644 --- a/cpp/src/arrow/compute/kernels/take-test.cc +++ b/cpp/src/arrow/compute/kernels/take-test.cc @@ -261,6 +261,15 @@ TEST_F(TestTakeKernelWithList, TakeListListInt32) { "[[], [], [], [], [], [], [[1], [2, null, 2], []]]"); } +class TestTakeKernelWithLargeList : public TestTakeKernel {}; + +TEST_F(TestTakeKernelWithLargeList, TakeLargeListInt32) { + std::string list_json = "[[], [1,2], null, [3]]"; + this->AssertTake(large_list(int32()), list_json, "[]", "[]"); + this->AssertTake(large_list(int32()), list_json, "[null, 1, 2, 0]", + "[null, [1,2], null, []]"); +} + class TestTakeKernelWithFixedSizeList : public TestTakeKernel {}; TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListInt32) { diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index b08f974b58c..b51804ce141 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -43,8 +43,6 @@ std::string ExtensionType::ToString() const { return ss.str(); } -std::string ExtensionType::name() const { return "extension"; } - ExtensionArray::ExtensionArray(const std::shared_ptr& data) { SetData(data); } ExtensionArray::ExtensionArray(const std::shared_ptr& type, diff --git a/cpp/src/arrow/extension_type.h b/cpp/src/arrow/extension_type.h index 8bf4639bd12..37b749f30ff 100644 --- a/cpp/src/arrow/extension_type.h +++ b/cpp/src/arrow/extension_type.h @@ -34,13 +34,16 @@ class ARROW_EXPORT ExtensionType : public DataType { public: static constexpr Type::type type_id = Type::EXTENSION; + static constexpr const char* type_name() { return "extension"; } + /// \brief The type of array used to represent this extension type's data std::shared_ptr storage_type() const { return storage_type_; } DataTypeLayout layout() const override; std::string ToString() const override; - std::string name() const override; + + std::string name() const override { return "extension"; } /// \brief Unique name of extension type used to identify type for /// serialization diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 49a884e1f88..ddd68e2a6f6 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -165,7 +165,7 @@ class SchemaWriter { template typename std::enable_if::value || - std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value, void>::type WriteTypeMetadata(const T& type) {} @@ -334,6 +334,11 @@ class SchemaWriter { return Status::OK(); } + Status Visit(const LargeListType& type) { + WriteName("large_list", type); + return Status::OK(); + } + Status Visit(const MapType& type) { WriteName("map", type); return Status::OK(); @@ -583,11 +588,14 @@ class ArrayWriter { return VisitArrayValues(*array.indices()); } - Status Visit(const ListArray& array) { + template + typename std::enable_if::value || + std::is_base_of::value, + Status>::type + Visit(const T& array) { WriteValidityField(array); WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length() + 1); - const auto& type = checked_cast(*array.type()); - return WriteChildren(type.children(), {array.values()}); + return WriteChildren(array.type()->children(), {array.values()}); } Status Visit(const FixedSizeListArray& array) { @@ -948,6 +956,11 @@ static Status GetType(const RjObject& json_type, return Status::Invalid("List must have exactly one child"); } *type = list(children[0]); + } else if (type_name == "large_list") { + if (children.size() != 1) { + return Status::Invalid("Large list must have exactly one child"); + } + *type = large_list(children[0]); } else if (type_name == "map") { return GetMap(json_type, children, type); } else if (type_name == "fixedsizelist") { @@ -1263,15 +1276,23 @@ class ArrayReader { T* values = reinterpret_cast(buffer->mutable_data()); for (int i = 0; i < length; ++i) { const rj::Value& val = json_array[i]; - DCHECK(val.IsInt()); - values[i] = static_cast(val.GetInt()); + DCHECK(val.IsInt() || val.IsInt64()); + if (val.IsInt()) { + values[i] = static_cast(val.GetInt()); + } else { + values[i] = static_cast(val.GetInt64()); + } } *out = buffer; return Status::OK(); } + template Status CreateList(const std::shared_ptr& type, std::shared_ptr* out) { + using offset_type = typename T::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + int32_t null_count = 0; std::shared_ptr validity_buffer; RETURN_NOT_OK(GetValidityBuffer(is_valid_, &null_count, &validity_buffer)); @@ -1279,19 +1300,23 @@ class ArrayReader { const auto& json_offsets = obj_.FindMember("OFFSET"); RETURN_NOT_ARRAY("OFFSET", json_offsets, obj_); std::shared_ptr offsets_buffer; - RETURN_NOT_OK(GetIntArray(json_offsets->value.GetArray(), length_ + 1, - &offsets_buffer)); + RETURN_NOT_OK(GetIntArray(json_offsets->value.GetArray(), length_ + 1, + &offsets_buffer)); std::vector> children; RETURN_NOT_OK(GetChildren(obj_, *type, &children)); DCHECK_EQ(children.size(), 1); - out->reset(new ListArray(type, length_, offsets_buffer, children[0], validity_buffer, + out->reset(new ArrayType(type, length_, offsets_buffer, children[0], validity_buffer, null_count)); return Status::OK(); } - Status Visit(const ListType& type) { return CreateList(type_, &result_); } + Status Visit(const ListType& type) { return CreateList(type_, &result_); } + + Status Visit(const LargeListType& type) { + return CreateList(type_, &result_); + } Status Visit(const MapType& type) { auto list_type = std::make_shared(field( @@ -1299,7 +1324,7 @@ class ArrayReader { struct_({field("key", type.key_type(), false), field("value", type.item_type())}), false)); std::shared_ptr list_array; - RETURN_NOT_OK(CreateList(list_type, &list_array)); + RETURN_NOT_OK(CreateList(list_type, &list_array)); auto map_data = list_array->data(); map_data->type = type_; result_ = std::make_shared(map_data); diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc index 77ab7702109..c202402e8d0 100644 --- a/cpp/src/arrow/ipc/json-simple-test.cc +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ -551,6 +551,21 @@ TEST(TestList, IntegerListList) { } } +TEST(TestLargeList, Basics) { + // Similar as TestList above, only testing the basics + auto pool = default_memory_pool(); + std::shared_ptr type = large_list(int16()); + std::shared_ptr offsets, values, expected, actual; + + ASSERT_OK(ArrayFromJSON(type, "[[], [null], [6, null]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 0, 1, 3}, &offsets); + auto is_valid = std::vector{false, true, false}; + ArrayFromVector(is_valid, {0, 6, 0}, &values); + ASSERT_OK(LargeListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); +} + TEST(TestMap, IntegerToInteger) { auto type = map(int16(), int16()); std::shared_ptr expected, actual; diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index 20ac025665a..d13d346aa5d 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -415,15 +415,19 @@ class FixedSizeBinaryConverter final // ------------------------------------------------------------------------ // Converter for list arrays -class ListConverter final : public ConcreteConverter { +template +class ListConverter final : public ConcreteConverter> { public: - explicit ListConverter(const std::shared_ptr& type) { type_ = type; } + using BuilderType = typename TypeTraits::BuilderType; + + explicit ListConverter(const std::shared_ptr& type) { this->type_ = type; } Status Init() override { - const auto& list_type = checked_cast(*type_); + const auto& list_type = checked_cast(*this->type_); RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); auto child_builder = child_converter_->builder(); - builder_ = std::make_shared(default_memory_pool(), child_builder, type_); + builder_ = + std::make_shared(default_memory_pool(), child_builder, this->type_); return Status::OK(); } @@ -441,7 +445,7 @@ class ListConverter final : public ConcreteConverter { std::shared_ptr builder() override { return builder_; } private: - std::shared_ptr builder_; + std::shared_ptr builder_; std::shared_ptr child_converter_; }; @@ -734,7 +738,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::BOOL, BooleanConverter) SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter) SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter) - SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter) + SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, ListConverter) SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter) SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) diff --git a/cpp/src/arrow/ipc/json-test.cc b/cpp/src/arrow/ipc/json-test.cc index 338552dd575..fd8c96223f3 100644 --- a/cpp/src/arrow/ipc/json-test.cc +++ b/cpp/src/arrow/ipc/json-test.cc @@ -145,7 +145,9 @@ TEST(TestJsonSchemaWriter, FlatTypes) { field("f16", timestamp(TimeUnit::NANO)), field("f17", time64(TimeUnit::MICRO)), field("f18", union_({field("u1", int8()), field("u2", time32(TimeUnit::MILLI))}, - {0, 1}, UnionMode::DENSE))}; + {0, 1}, UnionMode::DENSE)), + field("f19", large_list(uint8())), + }; Schema schema(fields); TestSchemaRoundTrip(schema); @@ -194,15 +196,24 @@ TEST(TestJsonArrayWriter, NestedTypes) { // List std::vector list_is_valid = {true, false, true, true, true}; - std::vector offsets = {0, 0, 0, 1, 4, 7}; - std::shared_ptr list_bitmap; ASSERT_OK(GetBitmapFromVector(list_is_valid, &list_bitmap)); + std::vector offsets = {0, 0, 0, 1, 4, 7}; std::shared_ptr offsets_buffer = Buffer::Wrap(offsets); + { + ListArray list_array(list(value_type), 5, offsets_buffer, values_array, list_bitmap, + 1); + TestArrayRoundTrip(list_array); + } - ListArray list_array(list(value_type), 5, offsets_buffer, values_array, list_bitmap, 1); - - TestArrayRoundTrip(list_array); + // LargeList + std::vector large_offsets = {0, 0, 0, 1, 4, 7}; + std::shared_ptr large_offsets_buffer = Buffer::Wrap(large_offsets); + { + LargeListArray list_array(large_list(value_type), 5, large_offsets_buffer, + values_array, list_bitmap, 1); + TestArrayRoundTrip(list_array); + } // Map auto map_type = map(utf8(), int32()); diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 93f859a0a07..c9e1bd77be2 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -322,6 +322,12 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, } *out = std::make_shared(children[0]); return Status::OK(); + case flatbuf::Type_LargeList: + if (children.size() != 1) { + return Status::Invalid("LargeList must have exactly 1 child field"); + } + *out = std::make_shared(children[0]); + return Status::OK(); case flatbuf::Type_Map: if (children.size() != 1) { return Status::Invalid("Map must have exactly 1 child field"); @@ -640,6 +646,13 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const LargeListType& type) { + fb_type_ = flatbuf::Type_LargeList; + RETURN_NOT_OK(AppendChildFields(fbb_, type, &children_, dictionary_memo_)); + type_offset_ = flatbuf::CreateLargeList(fbb_).Union(); + return Status::OK(); + } + Status Visit(const MapType& type) { fb_type_ = flatbuf::Type_Map; RETURN_NOT_OK(AppendChildFields(fbb_, type, &children_, dictionary_memo_)); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 8ddec2e9aad..4a554aae208 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -42,6 +42,7 @@ #include "arrow/status.h" #include "arrow/tensor.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -213,6 +214,21 @@ class ArrayLoader { return GetBuffer(context_->buffer_index++, &out_->buffers[2]); } + template + Status LoadList(const TYPE& type) { + out_->buffers.resize(2); + + RETURN_NOT_OK(LoadCommon()); + RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[1])); + + const int num_children = type.num_children(); + if (num_children != 1) { + return Status::Invalid("Wrong number of children: ", num_children); + } + + return LoadChildren(type.children()); + } + Status LoadChild(const Field& field, ArrayData* out) { ArrayLoader loader(field, out, context_); --context_->max_recursion_depth; @@ -262,18 +278,9 @@ class ArrayLoader { return GetBuffer(context_->buffer_index++, &out_->buffers[1]); } - Status Visit(const ListType& type) { - out_->buffers.resize(2); - - RETURN_NOT_OK(LoadCommon()); - RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[1])); - - const int num_children = type.num_children(); - if (num_children != 1) { - return Status::Invalid("Wrong number of children: ", num_children); - } - - return LoadChildren(type.children()); + template + enable_if_base_list Visit(const T& type) { + return LoadList(type); } Status Visit(const FixedSizeListType& type) { diff --git a/cpp/src/arrow/ipc/test-common.cc b/cpp/src/arrow/ipc/test-common.cc index 4cf13ecc059..1cb40d98c98 100644 --- a/cpp/src/arrow/ipc/test-common.cc +++ b/cpp/src/arrow/ipc/test-common.cc @@ -72,9 +72,13 @@ Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool return Status::OK(); } -Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, - bool include_nulls, MemoryPool* pool, - std::shared_ptr* out) { +template +static Status MakeListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + using offset_type = typename TypeClass::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + // Create the null list values std::vector valid_lists(num_lists); const double null_percent = include_nulls ? 0.1 : 0; @@ -83,39 +87,52 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li // Create list offsets const int max_list_size = 10; - std::vector list_sizes(num_lists, 0); - std::vector offsets( + std::vector list_sizes(num_lists, 0); + std::vector offsets( num_lists + 1, 0); // +1 so we can shift for nulls. See partial sum below. - const uint32_t seed = static_cast(child_array->length()); + const auto seed = static_cast(child_array->length()); if (num_lists > 0) { rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data()); // make sure sizes are consistent with null std::transform(list_sizes.begin(), list_sizes.end(), valid_lists.begin(), list_sizes.begin(), - [](int32_t size, int32_t valid) { return valid == 0 ? 0 : size; }); + [](offset_type size, uint8_t valid) { return valid == 0 ? 0 : size; }); std::partial_sum(list_sizes.begin(), list_sizes.end(), ++offsets.begin()); // Force invariants - const int32_t child_length = static_cast(child_array->length()); + const auto child_length = static_cast(child_array->length()); offsets[0] = 0; std::replace_if(offsets.begin(), offsets.end(), - [child_length](int32_t offset) { return offset > child_length; }, + [child_length](offset_type offset) { return offset > child_length; }, child_length); } - offsets[num_lists] = static_cast(child_array->length()); + offsets[num_lists] = static_cast(child_array->length()); /// TODO(wesm): Implement support for nulls in ListArray::FromArrays std::shared_ptr null_bitmap, offsets_buffer; RETURN_NOT_OK(GetBitmapFromVector(valid_lists, &null_bitmap)); RETURN_NOT_OK(CopyBufferFromVector(offsets, pool, &offsets_buffer)); - *out = std::make_shared(list(child_array->type()), num_lists, offsets_buffer, - child_array, null_bitmap, kUnknownNullCount); + *out = std::make_shared(std::make_shared(child_array->type()), + num_lists, offsets_buffer, child_array, null_bitmap, + kUnknownNullCount); return ValidateArray(**out); } +Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + return MakeListArray(child_array, num_lists, include_nulls, pool, out); +} + +Status MakeRandomLargeListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + return MakeListArray(child_array, num_lists, include_nulls, pool, out); +} + Status MakeRandomMapArray(const std::shared_ptr& key_array, const std::shared_ptr& item_array, int num_maps, bool include_nulls, MemoryPool* pool, @@ -274,22 +291,24 @@ Status MakeListRecordBatch(std::shared_ptr* out) { // Make the schema auto f0 = field("f0", list(int32())); auto f1 = field("f1", list(list(int32()))); - auto f2 = field("f2", int32()); + auto f2 = field("f2", large_list(int32())); auto schema = ::arrow::schema({f0, f1, f2}); // Example data MemoryPool* pool = default_memory_pool(); const int length = 200; - std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + std::shared_ptr leaf_values, list_array, list_list_array, large_list_array; const bool include_nulls = true; RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); RETURN_NOT_OK( MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); RETURN_NOT_OK( MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); - RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); - *out = RecordBatch::Make(schema, length, {list_array, list_list_array, flat_array}); + RETURN_NOT_OK(MakeRandomLargeListArray(leaf_values, length, include_nulls, pool, + &large_list_array)); + *out = + RecordBatch::Make(schema, length, {list_array, list_list_array, large_list_array}); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 0ec98349c7a..c2e56ab86f5 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -48,6 +48,11 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li bool include_nulls, MemoryPool* pool, std::shared_ptr* out); +ARROW_EXPORT +Status MakeRandomLargeListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out); + ARROW_EXPORT Status MakeRandomMapArray(const std::shared_ptr& child_array, int num_lists, bool include_nulls, MemoryPool* pool, diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index ec372074d8b..b5c16cdaf44 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -278,16 +278,19 @@ class RecordBatchSerializer : public ArrayVisitor { return Status::OK(); } - Status VisitList(const ListArray& array) { + template + Status VisitList(const ArrayType& array) { + using offset_type = typename ArrayType::offset_type; + std::shared_ptr value_offsets; - RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); + RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); out_->body_buffers.emplace_back(value_offsets); --max_recursion_depth_; std::shared_ptr values = array.values(); - int32_t values_offset = 0; - int32_t values_length = 0; + offset_type values_offset = 0; + offset_type values_length = 0; if (value_offsets) { values_offset = array.value_offset(0); values_length = array.value_offset(array.length()) - values_offset; @@ -352,6 +355,8 @@ class RecordBatchSerializer : public ArrayVisitor { Status Visit(const ListArray& array) override { return VisitList(array); } + Status Visit(const LargeListArray& array) override { return VisitList(array); } + Status Visit(const MapArray& array) override { return VisitList(array); } Status Visit(const FixedSizeListArray& array) override { diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index cdb230c6c3e..1eb09c72ab0 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -349,7 +349,6 @@ TEST_F(TestPrettyPrint, BinaryType) { TEST_F(TestPrettyPrint, ListType) { auto list_type = list(int64()); - auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); static const char* ex = R"expected([ [ @@ -367,7 +366,6 @@ TEST_F(TestPrettyPrint, ListType) { 3 ] ])expected"; - CheckArray(*array, {0, 10}, ex); static const char* ex_2 = R"expected( [ [ null @@ -384,7 +382,6 @@ TEST_F(TestPrettyPrint, ListType) { 3 ] ])expected"; - CheckArray(*array, {2, 10}, ex_2); static const char* ex_3 = R"expected([ [ null @@ -395,6 +392,16 @@ TEST_F(TestPrettyPrint, ListType) { 3 ] ])expected"; + + auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); + CheckArray(*array, {0, 10}, ex); + CheckArray(*array, {2, 10}, ex_2); + CheckStream(*array, {0, 1}, ex_3); + + list_type = large_list(int64()); + array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); + CheckArray(*array, {0, 10}, ex); + CheckArray(*array, {2, 10}, ex_2); CheckStream(*array, {0, 1}, ex_3); } diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 5a54e13b889..88bd5470a80 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -247,6 +247,7 @@ class ArrayPrinter : public PrettyPrinter { template inline typename std::enable_if::value || + std::is_base_of::value || std::is_base_of::value, Status>::type WriteDataValues(const T& array) { @@ -320,6 +321,7 @@ class ArrayPrinter : public PrettyPrinter { std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value || std::is_base_of::value, Status>::type diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 9f14b0fa2be..7c3a4ee9a4b 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -87,12 +87,12 @@ Decimal128Scalar::Decimal128Scalar(const Decimal128& value, const std::shared_ptr& type, bool is_valid) : Scalar{type, is_valid}, value(value) {} -ListScalar::ListScalar(const std::shared_ptr& value, - const std::shared_ptr& type, bool is_valid) +BaseListScalar::BaseListScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool is_valid) : Scalar{type, is_valid}, value(value) {} -ListScalar::ListScalar(const std::shared_ptr& value, bool is_valid) - : ListScalar(value, value->type(), is_valid) {} +BaseListScalar::BaseListScalar(const std::shared_ptr& value, bool is_valid) + : BaseListScalar(value, value->type(), is_valid) {} MapScalar::MapScalar(const std::shared_ptr& keys, const std::shared_ptr& items, diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 76aecd01fd3..09192896ba3 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -190,13 +190,21 @@ struct ARROW_EXPORT Decimal128Scalar : public Scalar { bool is_valid = true); }; -struct ARROW_EXPORT ListScalar : public Scalar { +struct ARROW_EXPORT BaseListScalar : public Scalar { std::shared_ptr value; - ListScalar(const std::shared_ptr& value, const std::shared_ptr& type, - bool is_valid = true); + BaseListScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool is_valid = true); - explicit ListScalar(const std::shared_ptr& value, bool is_valid = true); + BaseListScalar(const std::shared_ptr& value, bool is_valid); +}; + +struct ARROW_EXPORT ListScalar : public BaseListScalar { + using BaseListScalar::BaseListScalar; +}; + +struct ARROW_EXPORT LargeListScalar : public BaseListScalar { + using BaseListScalar::BaseListScalar; }; struct ARROW_EXPORT MapScalar : public Scalar { diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 7bfb7200171..eb492277503 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -404,6 +404,26 @@ TEST(TestListType, Basics) { ASSERT_EQ("list>", lt2.ToString()); } +TEST(TestLargeListType, Basics) { + std::shared_ptr vt = std::make_shared(); + + LargeListType list_type(vt); + ASSERT_EQ(list_type.id(), Type::LARGE_LIST); + + ASSERT_EQ("large_list", list_type.name()); + ASSERT_EQ("large_list", list_type.ToString()); + + ASSERT_EQ(list_type.value_type()->id(), vt->id()); + ASSERT_EQ(list_type.value_type()->id(), vt->id()); + + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ("large_list", lt->ToString()); + + LargeListType lt2(lt); + ASSERT_EQ("large_list>", lt2.ToString()); +} + TEST(TestMapType, Basics) { std::shared_ptr kt = std::make_shared(); std::shared_ptr it = std::make_shared(); @@ -563,6 +583,21 @@ TEST(TestTimestampType, ToString) { ASSERT_EQ("timestamp[us]", t4->ToString()); } +TEST(TestListType, Equals) { + auto t1 = list(utf8()); + auto t2 = list(utf8()); + auto t3 = list(binary()); + auto t4 = large_list(binary()); + auto t5 = large_list(binary()); + auto t6 = large_list(float64()); + + ASSERT_TRUE(t1->Equals(t2)); + ASSERT_FALSE(t1->Equals(t3)); + ASSERT_FALSE(t3->Equals(t4)); + ASSERT_TRUE(t4->Equals(t5)); + ASSERT_FALSE(t5->Equals(t6)); +} + TEST(TestNestedType, Equals) { auto create_struct = [](std::string inner_name, std::string struct_name) -> std::shared_ptr { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index dc00a796913..d8ed7bb2408 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -149,6 +149,12 @@ std::string ListType::ToString() const { return s.str(); } +std::string LargeListType::ToString() const { + std::stringstream s; + s << "large_list<" << value_field()->ToString() << ">"; + return s.str(); +} + MapType::MapType(const std::shared_ptr& key_type, const std::shared_ptr& item_type, bool keys_sorted) : ListType(std::make_shared( @@ -721,6 +727,14 @@ std::shared_ptr list(const std::shared_ptr& value_field) { return std::make_shared(value_field); } +std::shared_ptr large_list(const std::shared_ptr& value_type) { + return std::make_shared(value_type); +} + +std::shared_ptr large_list(const std::shared_ptr& value_field) { + return std::make_shared(value_field); +} + std::shared_ptr map(const std::shared_ptr& key_type, const std::shared_ptr& value_type, bool keys_sorted) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 572b888df11..753c73ebee0 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -149,7 +149,10 @@ struct Type { LARGE_STRING, /// Like BINARY, but with 64-bit offsets - LARGE_BINARY + LARGE_BINARY, + + /// Like LIST, but with 64-bit offsets + LARGE_LIST }; }; @@ -348,6 +351,8 @@ class ARROW_EXPORT CTypeImpl : public BASE { DataTypeLayout layout() const override { return {{1, bit_width()}, false}; } + std::string name() const override { return DERIVED::type_name(); } + std::string ToString() const override { return this->name(); } }; @@ -363,6 +368,8 @@ class ARROW_EXPORT NullType : public DataType, public NoExtraMeta { public: static constexpr Type::type type_id = Type::NA; + static constexpr const char* type_name() { return "null"; } + NullType() : DataType(Type::NA) {} std::string ToString() const override; @@ -379,6 +386,8 @@ class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta { public: static constexpr Type::type type_id = Type::BOOL; + static constexpr const char* type_name() { return "bool"; } + BooleanType() : FixedWidthType(Type::BOOL) {} std::string ToString() const override; @@ -386,6 +395,7 @@ class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta { DataTypeLayout layout() const override { return {{1, 1}, false}; } int bit_width() const override { return 1; } + std::string name() const override { return "bool"; } }; @@ -393,56 +403,56 @@ class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta { class ARROW_EXPORT UInt8Type : public detail::IntegerTypeImpl { public: - std::string name() const override { return "uint8"; } + static constexpr const char* type_name() { return "uint8"; } }; /// Concrete type class for signed 8-bit integer data class ARROW_EXPORT Int8Type : public detail::IntegerTypeImpl { public: - std::string name() const override { return "int8"; } + static constexpr const char* type_name() { return "int8"; } }; /// Concrete type class for unsigned 16-bit integer data class ARROW_EXPORT UInt16Type : public detail::IntegerTypeImpl { public: - std::string name() const override { return "uint16"; } + static constexpr const char* type_name() { return "uint16"; } }; /// Concrete type class for signed 16-bit integer data class ARROW_EXPORT Int16Type : public detail::IntegerTypeImpl { public: - std::string name() const override { return "int16"; } + static constexpr const char* type_name() { return "int16"; } }; /// Concrete type class for unsigned 32-bit integer data class ARROW_EXPORT UInt32Type : public detail::IntegerTypeImpl { public: - std::string name() const override { return "uint32"; } + static constexpr const char* type_name() { return "uint32"; } }; /// Concrete type class for signed 32-bit integer data class ARROW_EXPORT Int32Type : public detail::IntegerTypeImpl { public: - std::string name() const override { return "int32"; } + static constexpr const char* type_name() { return "int32"; } }; /// Concrete type class for unsigned 64-bit integer data class ARROW_EXPORT UInt64Type : public detail::IntegerTypeImpl { public: - std::string name() const override { return "uint64"; } + static constexpr const char* type_name() { return "uint64"; } }; /// Concrete type class for signed 64-bit integer data class ARROW_EXPORT Int64Type : public detail::IntegerTypeImpl { public: - std::string name() const override { return "int64"; } + static constexpr const char* type_name() { return "int64"; } }; /// Concrete type class for 16-bit floating-point data @@ -451,7 +461,7 @@ class ARROW_EXPORT HalfFloatType uint16_t> { public: Precision precision() const override; - std::string name() const override { return "halffloat"; } + static constexpr const char* type_name() { return "halffloat"; } }; /// Concrete type class for 32-bit floating-point data (C "float") @@ -459,7 +469,7 @@ class ARROW_EXPORT FloatType : public detail::CTypeImpl { public: Precision precision() const override; - std::string name() const override { return "float"; } + static constexpr const char* type_name() { return "float"; } }; /// Concrete type class for 64-bit floating-point data (C "double") @@ -467,7 +477,13 @@ class ARROW_EXPORT DoubleType : public detail::CTypeImpl { public: Precision precision() const override; - std::string name() const override { return "double"; } + static constexpr const char* type_name() { return "double"; } +}; + +/// \brief Base class for all variable-size list data types +class ARROW_EXPORT BaseListType : public NestedType { + public: + using NestedType::NestedType; }; /// \brief Concrete type class for list data @@ -475,16 +491,18 @@ class ARROW_EXPORT DoubleType /// List data is nested data where each value is a variable number of /// child items. Lists can be recursively nested, for example /// list(list(int32)). -class ARROW_EXPORT ListType : public NestedType { +class ARROW_EXPORT ListType : public BaseListType { public: static constexpr Type::type type_id = Type::LIST; using offset_type = int32_t; + static constexpr const char* type_name() { return "list"; } + // List can contain any other logical value type explicit ListType(const std::shared_ptr& value_type) : ListType(std::make_shared("item", value_type)) {} - explicit ListType(const std::shared_ptr& value_field) : NestedType(Type::LIST) { + explicit ListType(const std::shared_ptr& value_field) : BaseListType(type_id) { children_ = {value_field}; } @@ -501,6 +519,38 @@ class ARROW_EXPORT ListType : public NestedType { std::string name() const override { return "list"; } }; +/// \brief Concrete type class for large list data +/// +/// LargeListType is like ListType but with 64-bit rather than 32-bit offsets. +class ARROW_EXPORT LargeListType : public BaseListType { + public: + static constexpr Type::type type_id = Type::LARGE_LIST; + using offset_type = int64_t; + + static constexpr const char* type_name() { return "large_list"; } + + // List can contain any other logical value type + explicit LargeListType(const std::shared_ptr& value_type) + : LargeListType(std::make_shared("item", value_type)) {} + + explicit LargeListType(const std::shared_ptr& value_field) + : BaseListType(type_id) { + children_ = {value_field}; + } + + std::shared_ptr value_field() const { return children_[0]; } + + std::shared_ptr value_type() const { return children_[0]->type(); } + + DataTypeLayout layout() const override { + return {{1, CHAR_BIT * sizeof(offset_type)}, false}; + } + + std::string ToString() const override; + + std::string name() const override { return "large_list"; } +}; + /// \brief Concrete type class for map data /// /// Map data is nested data where each value is a variable number of @@ -510,6 +560,8 @@ class ARROW_EXPORT MapType : public ListType { public: static constexpr Type::type type_id = Type::MAP; + static constexpr const char* type_name() { return "map"; } + MapType(const std::shared_ptr& key_type, const std::shared_ptr& item_type, bool keys_sorted = false); @@ -532,6 +584,8 @@ class ARROW_EXPORT FixedSizeListType : public NestedType { public: static constexpr Type::type type_id = Type::FIXED_SIZE_LIST; + static constexpr const char* type_name() { return "fixed_size_list"; } + // List can contain any other logical value type FixedSizeListType(const std::shared_ptr& value_type, int32_t list_size) : FixedSizeListType(std::make_shared("item", value_type), list_size) {} @@ -570,6 +624,8 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType { static constexpr bool is_utf8 = false; using offset_type = int32_t; + static constexpr const char* type_name() { return "binary"; } + BinaryType() : BinaryType(Type::BINARY) {} DataTypeLayout layout() const override { @@ -592,6 +648,8 @@ class ARROW_EXPORT LargeBinaryType : public BaseBinaryType { static constexpr bool is_utf8 = false; using offset_type = int64_t; + static constexpr const char* type_name() { return "large_binary"; } + LargeBinaryType() : LargeBinaryType(Type::LARGE_BINARY) {} DataTypeLayout layout() const override { @@ -613,6 +671,8 @@ class ARROW_EXPORT StringType : public BinaryType { static constexpr Type::type type_id = Type::STRING; static constexpr bool is_utf8 = true; + static constexpr const char* type_name() { return "utf8"; } + StringType() : BinaryType(Type::STRING) {} std::string ToString() const override; @@ -625,6 +685,8 @@ class ARROW_EXPORT LargeStringType : public LargeBinaryType { static constexpr Type::type type_id = Type::LARGE_STRING; static constexpr bool is_utf8 = true; + static constexpr const char* type_name() { return "large_utf8"; } + LargeStringType() : LargeBinaryType(Type::LARGE_STRING) {} std::string ToString() const override; @@ -636,6 +698,8 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public Parametri public: static constexpr Type::type type_id = Type::FIXED_SIZE_BINARY; + static constexpr const char* type_name() { return "fixed_size_binary"; } + explicit FixedSizeBinaryType(int32_t byte_width) : FixedWidthType(Type::FIXED_SIZE_BINARY), byte_width_(byte_width) {} explicit FixedSizeBinaryType(int32_t byte_width, Type::type override_type_id) @@ -658,6 +722,8 @@ class ARROW_EXPORT StructType : public NestedType { public: static constexpr Type::type type_id = Type::STRUCT; + static constexpr const char* type_name() { return "struct"; } + explicit StructType(const std::vector>& fields); ~StructType() override; @@ -712,6 +778,8 @@ class ARROW_EXPORT Decimal128Type : public DecimalType { public: static constexpr Type::type type_id = Type::DECIMAL; + static constexpr const char* type_name() { return "decimal"; } + explicit Decimal128Type(int32_t precision, int32_t scale); std::string ToString() const override; @@ -727,6 +795,8 @@ class ARROW_EXPORT UnionType : public NestedType { public: static constexpr Type::type type_id = Type::UNION; + static constexpr const char* type_name() { return "union"; } + UnionType(const std::vector>& fields, const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); @@ -779,6 +849,8 @@ class ARROW_EXPORT Date32Type : public DateType { static constexpr Type::type type_id = Type::DATE32; static constexpr DateUnit UNIT = DateUnit::DAY; + static constexpr const char* type_name() { return "date32"; } + using c_type = int32_t; Date32Type(); @@ -797,6 +869,8 @@ class ARROW_EXPORT Date64Type : public DateType { static constexpr Type::type type_id = Type::DATE64; static constexpr DateUnit UNIT = DateUnit::MILLI; + static constexpr const char* type_name() { return "date64"; } + using c_type = int64_t; Date64Type(); @@ -834,6 +908,8 @@ class ARROW_EXPORT Time32Type : public TimeType { static constexpr Type::type type_id = Type::TIME32; using c_type = int32_t; + static constexpr const char* type_name() { return "time32"; } + int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } explicit Time32Type(TimeUnit::type unit = TimeUnit::MILLI); @@ -850,6 +926,8 @@ class ARROW_EXPORT Time64Type : public TimeType { static constexpr Type::type type_id = Type::TIME64; using c_type = int64_t; + static constexpr const char* type_name() { return "time64"; } + int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } explicit Time64Type(TimeUnit::type unit = TimeUnit::NANO); @@ -898,6 +976,8 @@ class ARROW_EXPORT TimestampType : public TemporalType, public ParametricType { typedef int64_t c_type; static constexpr Type::type type_id = Type::TIMESTAMP; + static constexpr const char* type_name() { return "timestamp"; } + int bit_width() const override { return static_cast(sizeof(int64_t) * CHAR_BIT); } explicit TimestampType(TimeUnit::type unit = TimeUnit::MILLI) @@ -936,6 +1016,8 @@ class ARROW_EXPORT MonthIntervalType : public IntervalType { using c_type = int32_t; static constexpr Type::type type_id = Type::INTERVAL; + static constexpr const char* type_name() { return "month_interval"; } + IntervalType::type interval_type() const override { return IntervalType::MONTHS; } int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } @@ -961,6 +1043,9 @@ class ARROW_EXPORT DayTimeIntervalType : public IntervalType { static_assert(sizeof(DayMilliseconds) == 8, "DayMilliseconds struct assumed to be of size 8 bytes"); static constexpr Type::type type_id = Type::INTERVAL; + + static constexpr const char* type_name() { return "day_time_interval"; } + IntervalType::type interval_type() const override { return IntervalType::DAY_TIME; } DayTimeIntervalType() : IntervalType() {} @@ -980,6 +1065,8 @@ class ARROW_EXPORT DurationType : public TemporalType, public ParametricType { static constexpr Type::type type_id = Type::DURATION; using c_type = int64_t; + static constexpr const char* type_name() { return "duration"; } + int bit_width() const override { return static_cast(sizeof(int64_t) * CHAR_BIT); } explicit DurationType(TimeUnit::type unit = TimeUnit::MILLI) @@ -1004,6 +1091,8 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { public: static constexpr Type::type type_id = Type::DICTIONARY; + static constexpr const char* type_name() { return "dictionary"; } + DictionaryType(const std::shared_ptr& index_type, const std::shared_ptr& value_type, bool ordered = false); @@ -1146,6 +1235,14 @@ std::shared_ptr list(const std::shared_ptr& value_type); ARROW_EXPORT std::shared_ptr list(const std::shared_ptr& value_type); +/// \brief Create a LargeListType instance from its child Field type +ARROW_EXPORT +std::shared_ptr large_list(const std::shared_ptr& value_type); + +/// \brief Create a LargeListType instance from its child DataType +ARROW_EXPORT +std::shared_ptr large_list(const std::shared_ptr& value_type); + /// \brief Create a MapType instance from its key and value DataTypes ARROW_EXPORT std::shared_ptr map(const std::shared_ptr& key_type, diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 9935af5111f..0711efc5f08 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -90,6 +90,11 @@ class ListArray; class ListBuilder; struct ListScalar; +class LargeListType; +class LargeListArray; +class LargeListBuilder; +struct LargeListScalar; + class MapType; class MapArray; class MapBuilder; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index df3e280d3f2..2d05b5c4321 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -299,6 +299,20 @@ struct TypeTraits { using ArrayType = ListArray; using BuilderType = ListBuilder; using ScalarType = ListScalar; + using OffsetType = Int32Type; + using OffsetArrayType = Int32Array; + using OffsetBuilderType = Int32Builder; + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = LargeListArray; + using BuilderType = LargeListBuilder; + using ScalarType = LargeListScalar; + using OffsetType = Int64Type; + using OffsetArrayType = Int64Array; + using OffsetBuilderType = Int64Builder; constexpr static bool is_parameter_free = false; }; @@ -468,10 +482,18 @@ template using enable_if_fixed_size_binary = typename std::enable_if::value, R>::type; +template +using enable_if_base_list = + typename std::enable_if::value, R>::type; + template using enable_if_list = typename std::enable_if::value, R>::type; +template +using enable_if_large_list = + typename std::enable_if::value, R>::type; + template using enable_if_fixed_size_list = typename std::enable_if::value, R>::type; diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index 2ec6c6421d0..cb4f165af81 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -59,6 +59,7 @@ ARRAY_VISITOR_DEFAULT(DayTimeIntervalArray) ARRAY_VISITOR_DEFAULT(MonthIntervalArray) ARRAY_VISITOR_DEFAULT(DurationArray) ARRAY_VISITOR_DEFAULT(ListArray) +ARRAY_VISITOR_DEFAULT(LargeListArray) ARRAY_VISITOR_DEFAULT(MapArray) ARRAY_VISITOR_DEFAULT(FixedSizeListArray) ARRAY_VISITOR_DEFAULT(StructArray) @@ -105,6 +106,7 @@ TYPE_VISITOR_DEFAULT(MonthIntervalType) TYPE_VISITOR_DEFAULT(DurationType) TYPE_VISITOR_DEFAULT(Decimal128Type) TYPE_VISITOR_DEFAULT(ListType) +TYPE_VISITOR_DEFAULT(LargeListType) TYPE_VISITOR_DEFAULT(MapType) TYPE_VISITOR_DEFAULT(FixedSizeListType) TYPE_VISITOR_DEFAULT(StructType) @@ -151,6 +153,7 @@ SCALAR_VISITOR_DEFAULT(MonthIntervalScalar) SCALAR_VISITOR_DEFAULT(DurationScalar) SCALAR_VISITOR_DEFAULT(Decimal128Scalar) SCALAR_VISITOR_DEFAULT(ListScalar) +SCALAR_VISITOR_DEFAULT(LargeListScalar) SCALAR_VISITOR_DEFAULT(MapScalar) SCALAR_VISITOR_DEFAULT(FixedSizeListScalar) SCALAR_VISITOR_DEFAULT(StructScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 1c854c47804..825c1722dbc 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -56,6 +56,7 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const DurationArray& array); virtual Status Visit(const Decimal128Array& array); virtual Status Visit(const ListArray& array); + virtual Status Visit(const LargeListArray& array); virtual Status Visit(const MapArray& array); virtual Status Visit(const FixedSizeListArray& array); virtual Status Visit(const StructArray& array); @@ -96,6 +97,7 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const DurationType& type); virtual Status Visit(const Decimal128Type& type); virtual Status Visit(const ListType& type); + virtual Status Visit(const LargeListType& type); virtual Status Visit(const MapType& type); virtual Status Visit(const FixedSizeListType& type); virtual Status Visit(const StructType& type); @@ -136,6 +138,7 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const DurationScalar& scalar); virtual Status Visit(const Decimal128Scalar& scalar); virtual Status Visit(const ListScalar& scalar); + virtual Status Visit(const LargeListScalar& scalar); virtual Status Visit(const MapScalar& scalar); virtual Status Visit(const FixedSizeListScalar& scalar); virtual Status Visit(const StructScalar& scalar); diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 3ed058e6492..84d89da0dad 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -58,6 +58,7 @@ namespace arrow { ACTION(Time64); \ ACTION(Decimal128); \ ACTION(List); \ + ACTION(LargeList); \ ACTION(Map); \ ACTION(FixedSizeList); \ ACTION(Struct); \ diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index a786065f4d3..bb2bab1335a 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -111,6 +111,7 @@ class LevelBuilder { " not supported yet"); \ } + NOT_IMPLEMENTED_VISIT(LargeList) NOT_IMPLEMENTED_VISIT(Map) NOT_IMPLEMENTED_VISIT(FixedSizeList) NOT_IMPLEMENTED_VISIT(Struct) @@ -118,6 +119,8 @@ class LevelBuilder { NOT_IMPLEMENTED_VISIT(Dictionary) NOT_IMPLEMENTED_VISIT(Extension) +#undef NOT_IMPLEMENTED_VISIT + Status GenerateLevels(const Array& array, const std::shared_ptr& field, int64_t* values_offset, int64_t* num_values, int64_t* num_levels, const std::shared_ptr& def_levels_scratch, diff --git a/format/Schema.fbs b/format/Schema.fbs index 06bcf6ee670..4ce66d66002 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -47,6 +47,11 @@ table Struct_ { table List { } +/// Same as List, but with 64-bit offsets, allowing to represent +/// extremely large data values. +table LargeList { +} + table FixedSizeList { /// Number of list items per value listSize: int; @@ -248,6 +253,7 @@ union Type { Duration, LargeBinary, LargeUtf8, + LargeList, } /// ----------------------------------------------------------------------