diff --git a/cpp/src/arrow/array-binary-test.cc b/cpp/src/arrow/array-binary-test.cc index cb8d6d53064..85a1620f0f1 100644 --- a/cpp/src/arrow/array-binary-test.cc +++ b/cpp/src/arrow/array-binary-test.cc @@ -40,6 +40,9 @@ namespace arrow { using internal::checked_cast; +using StringTypes = + ::testing::Types; + // ---------------------------------------------------------------------- // String / Binary tests @@ -67,8 +70,14 @@ void CheckStringArray(const ArrayType& array, const std::vector& st } } +template class TestStringArray : public ::testing::Test { public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using BuilderType = typename TypeTraits::BuilderType; + void SetUp() { chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; offsets_ = {0, 1, 1, 1, 3, 6}; @@ -85,268 +94,132 @@ class TestStringArray : public ::testing::Test { ASSERT_OK(BitUtil::BytesToBits(valid_bytes_, default_memory_pool(), &null_bitmap_)); null_count_ = CountNulls(valid_bytes_); - strings_ = std::make_shared(length_, offsets_buf_, value_buf_, - null_bitmap_, null_count_); - } - - protected: - std::vector offsets_; - std::vector chars_; - std::vector valid_bytes_; - - std::vector expected_; - - std::shared_ptr value_buf_; - std::shared_ptr offsets_buf_; - std::shared_ptr null_bitmap_; - - int64_t null_count_; - int64_t length_; - - std::shared_ptr strings_; -}; - -TEST_F(TestStringArray, TestArrayBasics) { - ASSERT_EQ(length_, strings_->length()); - ASSERT_EQ(1, strings_->null_count()); - ASSERT_OK(ValidateArray(*strings_)); -} - -TEST_F(TestStringArray, TestType) { - std::shared_ptr type = strings_->type(); - - ASSERT_EQ(Type::STRING, type->id()); - ASSERT_EQ(Type::STRING, strings_->type_id()); -} - -TEST_F(TestStringArray, TestListFunctions) { - int pos = 0; - for (size_t i = 0; i < expected_.size(); ++i) { - ASSERT_EQ(pos, strings_->value_offset(i)); - ASSERT_EQ(static_cast(expected_[i].size()), 
strings_->value_length(i)); - pos += static_cast(expected_[i].size()); - } -} - -TEST_F(TestStringArray, TestDestructor) { - auto arr = std::make_shared(length_, offsets_buf_, value_buf_, + strings_ = std::make_shared(length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); -} + } -TEST_F(TestStringArray, TestGetString) { - for (size_t i = 0; i < expected_.size(); ++i) { - if (valid_bytes_[i] == 0) { - ASSERT_TRUE(strings_->IsNull(i)); + void TestArrayBasics() { + ASSERT_EQ(length_, strings_->length()); + ASSERT_EQ(1, strings_->null_count()); + ASSERT_OK(ValidateArray(*strings_)); + TestInitialized(*strings_); + AssertZeroPadded(*strings_); + } + + void TestType() { + std::shared_ptr type = this->strings_->type(); + + if (std::is_same::value) { + ASSERT_EQ(Type::STRING, type->id()); + ASSERT_EQ(Type::STRING, this->strings_->type_id()); + } else if (std::is_same::value) { + ASSERT_EQ(Type::LARGE_STRING, type->id()); + ASSERT_EQ(Type::LARGE_STRING, this->strings_->type_id()); + } else if (std::is_same::value) { + ASSERT_EQ(Type::BINARY, type->id()); + ASSERT_EQ(Type::BINARY, this->strings_->type_id()); + } else if (std::is_same::value) { + ASSERT_EQ(Type::LARGE_BINARY, type->id()); + ASSERT_EQ(Type::LARGE_BINARY, this->strings_->type_id()); } else { - ASSERT_EQ(expected_[i], strings_->GetString(i)); + FAIL(); } } -} - -TEST_F(TestStringArray, TestEmptyStringComparison) { - offsets_ = {0, 0, 0, 0, 0, 0}; - offsets_buf_ = Buffer::Wrap(offsets_); - length_ = static_cast(offsets_.size() - 1); - - auto strings_a = std::make_shared(length_, offsets_buf_, nullptr, - null_bitmap_, null_count_); - auto strings_b = std::make_shared(length_, offsets_buf_, nullptr, - null_bitmap_, null_count_); - ASSERT_TRUE(strings_a->Equals(strings_b)); -} - -TEST_F(TestStringArray, CompareNullByteSlots) { - StringBuilder builder; - StringBuilder builder2; - StringBuilder builder3; - - ASSERT_OK(builder.Append("foo")); - ASSERT_OK(builder2.Append("foo")); - 
ASSERT_OK(builder3.Append("foo")); - - ASSERT_OK(builder.Append("bar")); - ASSERT_OK(builder2.AppendNull()); - - // same length, but different - ASSERT_OK(builder3.Append("xyz")); - - ASSERT_OK(builder.Append("baz")); - ASSERT_OK(builder2.Append("baz")); - ASSERT_OK(builder3.Append("baz")); - - std::shared_ptr array, array2, array3; - FinishAndCheckPadding(&builder, &array); - ASSERT_OK(builder2.Finish(&array2)); - ASSERT_OK(builder3.Finish(&array3)); - - const auto& a1 = checked_cast(*array); - const auto& a2 = checked_cast(*array2); - const auto& a3 = checked_cast(*array3); - - // The validity bitmaps are the same, the data is different, but the unequal - // portion is masked out - StringArray equal_array(3, a1.value_offsets(), a1.value_data(), a2.null_bitmap(), 1); - StringArray equal_array2(3, a3.value_offsets(), a3.value_data(), a2.null_bitmap(), 1); - ASSERT_TRUE(equal_array.Equals(equal_array2)); - ASSERT_TRUE(a2.RangeEquals(equal_array2, 0, 3, 0)); - - ASSERT_TRUE(equal_array.Array::Slice(1)->Equals(equal_array2.Array::Slice(1))); - ASSERT_TRUE( - equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1))); -} - -TEST_F(TestStringArray, TestSliceGetString) { - StringBuilder builder; - - ASSERT_OK(builder.Append("a")); - ASSERT_OK(builder.Append("b")); - ASSERT_OK(builder.Append("c")); - - std::shared_ptr array; - ASSERT_OK(builder.Finish(&array)); - auto s = array->Slice(1, 10); - auto arr = std::dynamic_pointer_cast(s); - ASSERT_EQ(arr->GetString(0), "b"); -} - -// ---------------------------------------------------------------------- -// String builder tests - -class TestStringBuilder : public TestBuilder { - public: - void SetUp() { - TestBuilder::SetUp(); - builder_.reset(new StringBuilder(pool_)); + void TestListFunctions() { + int64_t pos = 0; + for (size_t i = 0; i < expected_.size(); ++i) { + ASSERT_EQ(pos, strings_->value_offset(i)); + ASSERT_EQ(expected_[i].size(), strings_->value_length(i)); + pos += expected_[i].size(); + } 
} - void Done() { - std::shared_ptr out; - FinishAndCheckPadding(builder_.get(), &out); - - result_ = std::dynamic_pointer_cast(out); - ASSERT_OK(ValidateArray(*result_)); + void TestDestructor() { + auto arr = std::make_shared(length_, offsets_buf_, value_buf_, + null_bitmap_, null_count_); } - protected: - std::unique_ptr builder_; - std::shared_ptr result_; -}; - -TEST_F(TestStringBuilder, TestScalarAppend) { - std::vector strings = {"", "bb", "a", "", "ccc"}; - std::vector is_valid = {1, 1, 1, 0, 1}; - - int N = static_cast(strings.size()); - int reps = 1000; - - for (int j = 0; j < reps; ++j) { - for (int i = 0; i < N; ++i) { - if (!is_valid[i]) { - ASSERT_OK(builder_->AppendNull()); + void TestGetString() { + for (size_t i = 0; i < expected_.size(); ++i) { + if (valid_bytes_[i] == 0) { + ASSERT_TRUE(strings_->IsNull(i)); } else { - ASSERT_OK(builder_->Append(strings[i])); + ASSERT_FALSE(strings_->IsNull(i)); + ASSERT_EQ(expected_[i], strings_->GetString(i)); } } } - Done(); - ASSERT_EQ(reps * N, result_->length()); - ASSERT_EQ(reps, result_->null_count()); - ASSERT_EQ(reps * 6, result_->value_data()->size()); - - CheckStringArray(*result_, strings, is_valid, reps); -} - -TEST_F(TestStringBuilder, TestAppendVector) { - std::vector strings = {"", "bb", "a", "", "ccc"}; - std::vector valid_bytes = {1, 1, 1, 0, 1}; - - int N = static_cast(strings.size()); - int reps = 1000; + void TestEmptyStringComparison() { + offsets_ = {0, 0, 0, 0, 0, 0}; + offsets_buf_ = Buffer::Wrap(offsets_); + length_ = static_cast(offsets_.size() - 1); - for (int j = 0; j < reps; ++j) { - ASSERT_OK(builder_->AppendValues(strings, valid_bytes.data())); + auto strings_a = std::make_shared(length_, offsets_buf_, nullptr, + null_bitmap_, null_count_); + auto strings_b = std::make_shared(length_, offsets_buf_, nullptr, + null_bitmap_, null_count_); + ASSERT_TRUE(strings_a->Equals(strings_b)); } - Done(); - ASSERT_EQ(reps * N, result_->length()); - ASSERT_EQ(reps, result_->null_count()); - 
ASSERT_EQ(reps * 6, result_->value_data()->size()); + void TestCompareNullByteSlots() { + BuilderType builder; + BuilderType builder2; + BuilderType builder3; - CheckStringArray(*result_, strings, valid_bytes, reps); -} + ASSERT_OK(builder.Append("foo")); + ASSERT_OK(builder2.Append("foo")); + ASSERT_OK(builder3.Append("foo")); -TEST_F(TestStringBuilder, TestAppendCStringsWithValidBytes) { - const char* strings[] = {nullptr, "aaa", nullptr, "ignored", ""}; - std::vector valid_bytes = {1, 1, 1, 0, 1}; + ASSERT_OK(builder.Append("bar")); + ASSERT_OK(builder2.AppendNull()); - int N = static_cast(sizeof(strings) / sizeof(strings[0])); - int reps = 1000; + // same length, but different + ASSERT_OK(builder3.Append("xyz")); - for (int j = 0; j < reps; ++j) { - ASSERT_OK(builder_->AppendValues(strings, N, valid_bytes.data())); - } - Done(); + ASSERT_OK(builder.Append("baz")); + ASSERT_OK(builder2.Append("baz")); + ASSERT_OK(builder3.Append("baz")); - ASSERT_EQ(reps * N, result_->length()); - ASSERT_EQ(reps * 3, result_->null_count()); - ASSERT_EQ(reps * 3, result_->value_data()->size()); + std::shared_ptr array, array2, array3; + FinishAndCheckPadding(&builder, &array); + ASSERT_OK(builder2.Finish(&array2)); + ASSERT_OK(builder3.Finish(&array3)); - CheckStringArray(*result_, {"", "aaa", "", "", ""}, {0, 1, 0, 0, 1}, reps); -} + const auto& a1 = checked_cast(*array); + const auto& a2 = checked_cast(*array2); + const auto& a3 = checked_cast(*array3); -TEST_F(TestStringBuilder, TestAppendCStringsWithoutValidBytes) { - const char* strings[] = {"", "bb", "a", nullptr, "ccc"}; + // The validity bitmaps are the same, the data is different, but the unequal + // portion is masked out + ArrayType equal_array(3, a1.value_offsets(), a1.value_data(), a2.null_bitmap(), 1); + ArrayType equal_array2(3, a3.value_offsets(), a3.value_data(), a2.null_bitmap(), 1); - int N = static_cast(sizeof(strings) / sizeof(strings[0])); - int reps = 1000; + ASSERT_TRUE(equal_array.Equals(equal_array2)); + 
ASSERT_TRUE(a2.RangeEquals(equal_array2, 0, 3, 0)); - for (int j = 0; j < reps; ++j) { - ASSERT_OK(builder_->AppendValues(strings, N)); + ASSERT_TRUE(equal_array.Array::Slice(1)->Equals(equal_array2.Array::Slice(1))); + ASSERT_TRUE( + equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1))); } - Done(); - ASSERT_EQ(reps * N, result_->length()); - ASSERT_EQ(reps, result_->null_count()); - ASSERT_EQ(reps * 6, result_->value_data()->size()); + void TestSliceGetString() { + BuilderType builder; - CheckStringArray(*result_, {"", "bb", "a", "", "ccc"}, {1, 1, 1, 0, 1}, reps); -} + ASSERT_OK(builder.Append("a")); + ASSERT_OK(builder.Append("b")); + ASSERT_OK(builder.Append("c")); -TEST_F(TestStringBuilder, TestZeroLength) { - // All buffers are null - Done(); -} - -// Binary container type -// TODO(emkornfield) there should be some way to refactor these to avoid code duplicating -// with String -class TestBinaryArray : public ::testing::Test { - public: - void SetUp() { - chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; - offsets_ = {0, 1, 1, 1, 3, 6}; - valid_bytes_ = {1, 1, 0, 1, 1}; - expected_ = {"a", "", "", "bb", "ccc"}; - - MakeArray(); - } - - void MakeArray() { - length_ = static_cast(offsets_.size() - 1); - value_buf_ = Buffer::Wrap(chars_); - offsets_buf_ = Buffer::Wrap(offsets_); - - ASSERT_OK(BitUtil::BytesToBits(valid_bytes_, default_memory_pool(), &null_bitmap_)); - null_count_ = CountNulls(valid_bytes_); - - strings_ = std::make_shared(length_, offsets_buf_, value_buf_, - null_bitmap_, null_count_); + std::shared_ptr array; + ASSERT_OK(builder.Finish(&array)); + auto s = array->Slice(1, 10); + auto arr = std::dynamic_pointer_cast(s); + ASSERT_EQ(arr->GetString(0), "b"); } protected: - std::vector offsets_; + std::vector offsets_; std::vector chars_; std::vector valid_bytes_; @@ -359,300 +232,240 @@ class TestBinaryArray : public ::testing::Test { int64_t null_count_; int64_t length_; - std::shared_ptr strings_; + std::shared_ptr strings_; 
}; -TEST_F(TestBinaryArray, TestArrayBasics) { - ASSERT_EQ(length_, strings_->length()); - ASSERT_EQ(1, strings_->null_count()); - ASSERT_OK(ValidateArray(*strings_)); -} - -TEST_F(TestBinaryArray, TestType) { - std::shared_ptr type = strings_->type(); - - ASSERT_EQ(Type::BINARY, type->id()); - ASSERT_EQ(Type::BINARY, strings_->type_id()); -} +TYPED_TEST_CASE(TestStringArray, StringTypes); -TEST_F(TestBinaryArray, TestListFunctions) { - size_t pos = 0; - for (size_t i = 0; i < expected_.size(); ++i) { - ASSERT_EQ(pos, strings_->value_offset(i)); - ASSERT_EQ(static_cast(expected_[i].size()), strings_->value_length(i)); - pos += expected_[i].size(); - } -} +TYPED_TEST(TestStringArray, TestArrayBasics) { this->TestArrayBasics(); } -TEST_F(TestBinaryArray, TestDestructor) { - auto arr = std::make_shared(length_, offsets_buf_, value_buf_, - null_bitmap_, null_count_); -} +TYPED_TEST(TestStringArray, TestType) { this->TestType(); } -TEST_F(TestBinaryArray, TestGetValue) { - for (size_t i = 0; i < expected_.size(); ++i) { - if (valid_bytes_[i] == 0) { - ASSERT_TRUE(strings_->IsNull(i)); - } else { - ASSERT_FALSE(strings_->IsNull(i)); - ASSERT_EQ(strings_->GetString(i), expected_[i]); - } - } -} +TYPED_TEST(TestStringArray, TestListFunctions) { this->TestListFunctions(); } -TEST_F(TestBinaryArray, TestNullValuesInitialized) { - for (size_t i = 0; i < expected_.size(); ++i) { - if (valid_bytes_[i] == 0) { - ASSERT_TRUE(strings_->IsNull(i)); - } else { - ASSERT_FALSE(strings_->IsNull(i)); - ASSERT_EQ(strings_->GetString(i), expected_[i]); - } - } - TestInitialized(*strings_); -} +TYPED_TEST(TestStringArray, TestDestructor) { this->TestDestructor(); } -TEST_F(TestBinaryArray, TestPaddingZeroed) { AssertZeroPadded(*strings_); } +TYPED_TEST(TestStringArray, TestGetString) { this->TestGetString(); } -TEST_F(TestBinaryArray, TestGetString) { - for (size_t i = 0; i < expected_.size(); ++i) { - if (valid_bytes_[i] == 0) { - ASSERT_TRUE(strings_->IsNull(i)); - } else { - std::string 
val = strings_->GetString(i); - ASSERT_EQ(0, std::memcmp(expected_[i].data(), val.c_str(), val.size())); - } - } +TYPED_TEST(TestStringArray, TestEmptyStringComparison) { + this->TestEmptyStringComparison(); } -TEST_F(TestBinaryArray, TestEqualsEmptyStrings) { - BinaryBuilder builder; - - std::string empty_string(""); - for (int i = 0; i < 5; ++i) { - ASSERT_OK(builder.Append(empty_string)); - } - - std::shared_ptr left_arr; - FinishAndCheckPadding(&builder, &left_arr); +TYPED_TEST(TestStringArray, CompareNullByteSlots) { this->TestCompareNullByteSlots(); } - const BinaryArray& left = checked_cast(*left_arr); - std::shared_ptr right = - std::make_shared(left.length(), left.value_offsets(), nullptr, - left.null_bitmap(), left.null_count()); +TYPED_TEST(TestStringArray, TestSliceGetString) { this->TestSliceGetString(); } - ASSERT_TRUE(left.Equals(right)); - ASSERT_TRUE(left.RangeEquals(0, left.length(), 0, right)); -} +// ---------------------------------------------------------------------- +// String builder tests -class TestBinaryBuilder : public TestBuilder { +template +class TestStringBuilder : public TestBuilder { public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using BuilderType = typename TypeTraits::BuilderType; + void SetUp() { TestBuilder::SetUp(); - builder_.reset(new BinaryBuilder(pool_)); + builder_.reset(new BuilderType(pool_)); } void Done() { std::shared_ptr out; FinishAndCheckPadding(builder_.get(), &out); - result_ = std::dynamic_pointer_cast(out); + result_ = std::dynamic_pointer_cast(out); ASSERT_OK(ValidateArray(*result_)); } - protected: - std::unique_ptr builder_; - std::shared_ptr result_; -}; - -TEST_F(TestBinaryBuilder, TestScalarAppend) { - std::vector strings = {"", "bb", "a", "", "ccc"}; - std::vector is_valid = {1, 1, 1, 0, 1}; + void TestScalarAppend() { + std::vector strings = {"", "bb", "a", "", "ccc"}; + std::vector is_valid = {1, 1, 1, 0, 1}; 
- int N = static_cast(strings.size()); - int reps = 10; + int N = static_cast(strings.size()); + int reps = 10; - for (int j = 0; j < reps; ++j) { - for (int i = 0; i < N; ++i) { - if (!is_valid[i]) { - ASSERT_OK(builder_->AppendNull()); - } else { - ASSERT_OK(builder_->Append(strings[i])); + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (!is_valid[i]) { + ASSERT_OK(builder_->AppendNull()); + } else { + ASSERT_OK(builder_->Append(strings[i])); + } } } - } - Done(); - ASSERT_OK(ValidateArray(*result_)); - ASSERT_EQ(reps * N, result_->length()); - ASSERT_EQ(reps, result_->null_count()); - ASSERT_EQ(reps * 6, result_->value_data()->size()); - - CheckStringArray(*result_, strings, is_valid, reps); -} - -TEST_F(TestBinaryBuilder, TestAppendNulls) { - ASSERT_OK(builder_->Append("bow")); - ASSERT_OK(builder_->AppendNulls(3)); - ASSERT_OK(builder_->Append("arrow")); - Done(); - ASSERT_OK(ValidateArray(*result_)); + Done(); - ASSERT_EQ(5, result_->length()); - ASSERT_EQ(3, result_->null_count()); - ASSERT_EQ(8, result_->value_data()->size()); + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 6, result_->value_data()->size()); - CheckStringArray(*result_, {"bow", "", "", "", "arrow"}, {1, 0, 0, 0, 1}); -} + CheckStringArray(*result_, strings, is_valid, reps); + } -TEST_F(TestBinaryBuilder, TestScalarAppendUnsafe) { - std::vector strings = {"", "bb", "a", "", "ccc"}; - std::vector is_valid = {1, 1, 1, 0, 1}; + void TestScalarAppendUnsafe() { + std::vector strings = {"", "bb", "a", "", "ccc"}; + std::vector is_valid = {1, 1, 1, 0, 1}; - int N = static_cast(strings.size()); - int reps = 13; - int total_length = 0; - for (auto&& s : strings) total_length += static_cast(s.size()); + int N = static_cast(strings.size()); + int reps = 13; + int64_t total_length = 0; + for (const auto& s : strings) { + total_length += static_cast(s.size()); + } - ASSERT_OK(builder_->Reserve(N * reps)); - 
ASSERT_OK(builder_->ReserveData(total_length * reps)); + ASSERT_OK(builder_->Reserve(N * reps)); + ASSERT_OK(builder_->ReserveData(total_length * reps)); - for (int j = 0; j < reps; ++j) { - for (int i = 0; i < N; ++i) { - if (!is_valid[i]) { - builder_->UnsafeAppendNull(); - } else { - builder_->UnsafeAppend(strings[i]); + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (!is_valid[i]) { + builder_->UnsafeAppendNull(); + } else { + builder_->UnsafeAppend(strings[i]); + } } } - } - ASSERT_EQ(builder_->value_data_length(), total_length * reps); - Done(); - ASSERT_OK(ValidateArray(*result_)); - ASSERT_EQ(reps * N, result_->length()); - ASSERT_EQ(reps, result_->null_count()); - ASSERT_EQ(reps * total_length, result_->value_data()->size()); - - CheckStringArray(*result_, strings, is_valid, reps); -} + ASSERT_EQ(builder_->value_data_length(), total_length * reps); + Done(); -TEST_F(TestBinaryBuilder, TestCapacityReserve) { - std::vector strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", - "dddddddddd"}; - int N = static_cast(strings.size()); - int reps = 15; - int64_t length = 0; - int64_t capacity = 1000; - int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity); + ASSERT_OK(ValidateArray(*result_)); + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * total_length, result_->value_data()->size()); - ASSERT_OK(builder_->ReserveData(capacity)); + CheckStringArray(*result_, strings, is_valid, reps); + } - ASSERT_EQ(length, builder_->value_data_length()); - ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); + void TestVectorAppend() { + std::vector strings = {"", "bb", "a", "", "ccc"}; + std::vector valid_bytes = {1, 1, 1, 0, 1}; - for (int j = 0; j < reps; ++j) { - for (int i = 0; i < N; ++i) { - ASSERT_OK(builder_->Append(strings[i])); - length += static_cast(strings[i].size()); + int N = static_cast(strings.size()); + int reps = 1000; - ASSERT_EQ(length, 
builder_->value_data_length()); - ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); + for (int j = 0; j < reps; ++j) { + ASSERT_OK(builder_->AppendValues(strings, valid_bytes.data())); } + Done(); + + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 6, result_->value_data()->size()); + + CheckStringArray(*result_, strings, valid_bytes, reps); } - int extra_capacity = 500; - expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity); + void TestAppendCStringsWithValidBytes() { + const char* strings[] = {nullptr, "aaa", nullptr, "ignored", ""}; + std::vector valid_bytes = {1, 1, 1, 0, 1}; - ASSERT_OK(builder_->ReserveData(extra_capacity)); + int N = static_cast(sizeof(strings) / sizeof(strings[0])); + int reps = 1000; - ASSERT_EQ(length, builder_->value_data_length()); - int64_t actual_capacity = builder_->value_data_capacity(); - ASSERT_GE(actual_capacity, expected_capacity); - ASSERT_EQ(actual_capacity & 63, 0); + for (int j = 0; j < reps; ++j) { + ASSERT_OK(builder_->AppendValues(strings, N, valid_bytes.data())); + } + Done(); - Done(); + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps * 3, result_->null_count()); + ASSERT_EQ(reps * 3, result_->value_data()->size()); - ASSERT_EQ(reps * N, result_->length()); - ASSERT_EQ(0, result_->null_count()); - ASSERT_EQ(reps * 40, result_->value_data()->size()); + CheckStringArray(*result_, {"", "aaa", "", "", ""}, {0, 1, 0, 0, 1}, reps); + } - // Capacity is shrunk after `Finish` - ASSERT_EQ(640, result_->value_data()->capacity()); -} + void TestAppendCStringsWithoutValidBytes() { + const char* strings[] = {"", "bb", "a", nullptr, "ccc"}; -TEST_F(TestBinaryBuilder, TestZeroLength) { - // All buffers are null - Done(); -} + int N = static_cast(sizeof(strings) / sizeof(strings[0])); + int reps = 1000; -// ---------------------------------------------------------------------- -// Slice tests + for (int j = 0; j < reps; ++j) { + 
ASSERT_OK(builder_->AppendValues(strings, N)); + } + Done(); -template -void CheckSliceEquality() { - using Traits = TypeTraits; - using BuilderType = typename Traits::BuilderType; + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 6, result_->value_data()->size()); - BuilderType builder; + CheckStringArray(*result_, {"", "bb", "a", "", "ccc"}, {1, 1, 1, 0, 1}, reps); + } - std::vector strings = {"foo", "", "bar", "baz", "qux", ""}; - std::vector is_null = {0, 1, 0, 1, 0, 0}; + void TestCapacityReserve() { + std::vector strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", + "dddddddddd"}; + int N = static_cast(strings.size()); + int reps = 15; + int64_t length = 0; + int64_t capacity = 1000; + int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity); - int N = static_cast(strings.size()); - int reps = 10; + ASSERT_OK(builder_->ReserveData(capacity)); - for (int j = 0; j < reps; ++j) { - for (int i = 0; i < N; ++i) { - if (is_null[i]) { - ASSERT_OK(builder.AppendNull()); - } else { - ASSERT_OK(builder.Append(strings[i])); + ASSERT_EQ(length, builder_->value_data_length()); + ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + ASSERT_OK(builder_->Append(strings[i])); + length += static_cast(strings[i].size()); + + ASSERT_EQ(length, builder_->value_data_length()); + ASSERT_EQ(expected_capacity, builder_->value_data_capacity()); } } - } - std::shared_ptr array; - FinishAndCheckPadding(&builder, &array); + int extra_capacity = 500; + expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity); + + ASSERT_OK(builder_->ReserveData(extra_capacity)); - std::shared_ptr slice, slice2; + ASSERT_EQ(length, builder_->value_data_length()); + int64_t actual_capacity = builder_->value_data_capacity(); + ASSERT_GE(actual_capacity, expected_capacity); + ASSERT_EQ(actual_capacity & 63, 0); - slice = array->Slice(5); - slice2 
= array->Slice(5); - ASSERT_EQ(N * reps - 5, slice->length()); + Done(); - ASSERT_TRUE(slice->Equals(slice2)); - ASSERT_TRUE(array->RangeEquals(5, slice->length(), 0, slice)); + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(0, result_->null_count()); + ASSERT_EQ(reps * 40, result_->value_data()->size()); + } + + void TestZeroLength() { + // All buffers are null + Done(); + ASSERT_EQ(result_->length(), 0); + ASSERT_EQ(result_->null_count(), 0); + } - // Chained slices - slice2 = array->Slice(2)->Slice(3); - ASSERT_TRUE(slice->Equals(slice2)); + protected: + std::unique_ptr builder_; + std::shared_ptr result_; +}; - slice = array->Slice(5, 20); - slice2 = array->Slice(5, 20); - ASSERT_EQ(20, slice->length()); +TYPED_TEST_CASE(TestStringBuilder, StringTypes); - ASSERT_TRUE(slice->Equals(slice2)); - ASSERT_TRUE(array->RangeEquals(5, 25, 0, slice)); +TYPED_TEST(TestStringBuilder, TestScalarAppend) { this->TestScalarAppend(); } - ASSERT_OK(builder.Append("a")); - for (int j = 0; j < reps; ++j) { - ASSERT_OK(builder.Append("")); - } - FinishAndCheckPadding(&builder, &array); - slice = array->Slice(1); +TYPED_TEST(TestStringBuilder, TestScalarAppendUnsafe) { this->TestScalarAppendUnsafe(); } - for (int j = 0; j < reps; ++j) { - ASSERT_OK(builder.Append("")); - } - FinishAndCheckPadding(&builder, &array); +TYPED_TEST(TestStringBuilder, TestVectorAppend) { this->TestVectorAppend(); } - AssertArraysEqual(*slice, *array); +TYPED_TEST(TestStringBuilder, TestAppendCStringsWithValidBytes) { + this->TestAppendCStringsWithValidBytes(); } -TEST_F(TestBinaryArray, TestSliceEquality) { CheckSliceEquality(); } +TYPED_TEST(TestStringBuilder, TestAppendCStringsWithoutValidBytes) { + this->TestAppendCStringsWithoutValidBytes(); +} -TEST_F(TestStringArray, TestSliceEquality) { CheckSliceEquality(); } +TYPED_TEST(TestStringBuilder, TestCapacityReserve) { this->TestCapacityReserve(); } -TEST_F(TestBinaryArray, LengthZeroCtor) { BinaryArray array(0, nullptr, nullptr); } 
+TYPED_TEST(TestStringBuilder, TestZeroLength) { this->TestZeroLength(); } // ---------------------------------------------------------------------- // ChunkedBinaryBuilder tests diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 5f76f083968..0b7d8f170cb 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -386,31 +386,26 @@ BinaryArray::BinaryArray(const std::shared_ptr& data) { SetData(data); } -void BinaryArray::SetData(const std::shared_ptr& data) { - ARROW_CHECK_EQ(data->buffers.size(), 3); - auto value_offsets = data->buffers[1]; - auto value_data = data->buffers[2]; - this->Array::SetData(data); - raw_data_ = value_data == nullptr ? nullptr : value_data->data(); - raw_value_offsets_ = value_offsets == nullptr - ? nullptr - : reinterpret_cast(value_offsets->data()); -} - BinaryArray::BinaryArray(int64_t length, const std::shared_ptr& value_offsets, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap, int64_t null_count, - int64_t offset) - : BinaryArray(binary(), length, value_offsets, data, null_bitmap, null_count, - offset) {} - -BinaryArray::BinaryArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) { - SetData(ArrayData::Make(type, length, {null_bitmap, value_offsets, data}, null_count, - offset)); + SetData(ArrayData::Make(binary(), length, {null_bitmap, value_offsets, data}, + null_count, offset)); +} + +LargeBinaryArray::LargeBinaryArray(const std::shared_ptr& data) { + ARROW_CHECK_EQ(data->type->id(), Type::LARGE_BINARY); + SetData(data); +} + +LargeBinaryArray::LargeBinaryArray(int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap, + int64_t null_count, int64_t offset) { + SetData(ArrayData::Make(large_binary(), length, {null_bitmap, value_offsets, data}, + null_count, offset)); } 
StringArray::StringArray(const std::shared_ptr& data) { @@ -421,8 +416,24 @@ StringArray::StringArray(const std::shared_ptr& data) { StringArray::StringArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int64_t null_count, - int64_t offset) - : BinaryArray(utf8(), length, value_offsets, data, null_bitmap, null_count, offset) {} + int64_t offset) { + SetData(ArrayData::Make(utf8(), length, {null_bitmap, value_offsets, data}, null_count, + offset)); +} + +LargeStringArray::LargeStringArray(const std::shared_ptr& data) { + ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING); + SetData(data); +} + +LargeStringArray::LargeStringArray(int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap, + int64_t null_count, int64_t offset) { + SetData(ArrayData::Make(large_utf8(), length, {null_bitmap, value_offsets, data}, + null_count, offset)); +} // ---------------------------------------------------------------------- // Fixed width binary @@ -1148,20 +1159,14 @@ struct ValidateVisitor { return ValidateOffsets(array); } - Status Visit(const ListArray& array) { - if (array.length() < 0) { - return Status::Invalid("Length was negative"); - } - - auto value_offsets = array.value_offsets(); - if (array.length() && !value_offsets) { - return Status::Invalid("value_offsets_ was null"); - } - if (value_offsets->size() / static_cast(sizeof(int32_t)) < array.length()) { - return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), - " isn't large enough for length: ", array.length()); + Status Visit(const LargeBinaryArray& array) { + if (array.data()->buffers.size() != 3) { + return Status::Invalid("number of buffers was != 3"); } + return ValidateOffsets(array); + } + Status Visit(const ListArray& array) { if (!array.values()) { return Status::Invalid("values was null"); } @@ -1181,19 +1186,6 @@ struct ValidateVisitor { } 
Status Visit(const MapArray& array) { - if (array.length() < 0) { - return Status::Invalid("Length was negative"); - } - - auto value_offsets = array.value_offsets(); - if (array.length() && !value_offsets) { - return Status::Invalid("value_offsets_ was null"); - } - if (value_offsets->size() / static_cast(sizeof(int32_t)) < array.length()) { - return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), - " isn't large enough for length: ", array.length()); - } - if (!array.keys()) { return Status::Invalid("keys was null"); } @@ -1224,9 +1216,6 @@ struct ValidateVisitor { } Status Visit(const FixedSizeListArray& array) { - if (array.length() < 0) { - return Status::Invalid("Length was negative"); - } if (!array.values()) { return Status::Invalid("values was null"); } @@ -1240,14 +1229,6 @@ struct ValidateVisitor { } Status Visit(const StructArray& array) { - if (array.length() < 0) { - return Status::Invalid("Length was negative"); - } - - if (array.null_count() > array.length()) { - return Status::Invalid("Null count exceeds the length of this struct"); - } - if (array.num_fields() > 0) { // Validate fields int64_t array_length = array.field(0)->length(); @@ -1274,16 +1255,7 @@ struct ValidateVisitor { return Status::OK(); } - Status Visit(const UnionArray& array) { - if (array.length() < 0) { - return Status::Invalid("Length was negative"); - } - - if (array.null_count() > array.length()) { - return Status::Invalid("Null count exceeds the length of this struct"); - } - return Status::OK(); - } + Status Visit(const UnionArray& array) { return Status::OK(); } Status Visit(const DictionaryArray& array) { Type::type index_type_id = array.indices()->type()->id(); @@ -1310,12 +1282,23 @@ struct ValidateVisitor { protected: template Status ValidateOffsets(ArrayType& array) { - int32_t prev_offset = array.value_offset(0); + using offset_type = typename ArrayType::offset_type; + + auto value_offsets = array.value_offsets(); + if (array.length() && 
!value_offsets) { + return Status::Invalid("value_offsets_ was null"); + } + if (value_offsets->size() / static_cast(sizeof(offset_type)) < array.length()) { + return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), + " isn't large enough for length: ", array.length()); + } + + auto prev_offset = array.value_offset(0); if (array.offset() == 0 && prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); } for (int64_t i = 1; i <= array.length(); ++i) { - int32_t current_offset = array.value_offset(i); + auto current_offset = array.value_offset(i); if (array.IsNull(i - 1) && current_offset != prev_offset) { return Status::Invalid("Offset invariant failure at: ", i, " inconsistent value_offsets for null slot", @@ -1340,6 +1323,14 @@ Status ValidateArray(const Array& array) { const auto layout = type.layout(); const ArrayData& data = *array.data(); + if (array.length() < 0) { + return Status::Invalid("Array length is negative"); + } + + if (array.null_count() > array.length()) { + return Status::Invalid("Null count exceeds array length"); + } + if (data.buffers.size() != layout.bit_widths.size()) { return Status::Invalid("Expected ", layout.bit_widths.size(), " buffers in array " diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 599a6ea62af..e13088c65c7 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -492,6 +492,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { class ARROW_EXPORT ListArray : public Array { public: using TypeClass = ListType; + using offset_type = ListType::offset_type; explicit ListArray(const std::shared_ptr& data); @@ -635,24 +636,20 @@ class ARROW_EXPORT FixedSizeListArray : public Array { // ---------------------------------------------------------------------- // Binary and String -/// Concrete Array class for variable-size binary data -class ARROW_EXPORT BinaryArray : public FlatArray { +/// Base class for variable-sized binary arrays, regardless of offset size 
+/// and logical interpretation. +template +class BaseBinaryArray : public FlatArray { public: - using TypeClass = BinaryType; - - explicit BinaryArray(const std::shared_ptr& data); - - BinaryArray(int64_t length, const std::shared_ptr& value_offsets, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; /// Return the pointer to the given elements bytes // XXX should GetValue(int64_t i) return a string_view? - const uint8_t* GetValue(int64_t i, int32_t* out_length) const { + const uint8_t* GetValue(int64_t i, offset_type* out_length) const { // Account for base offset i += data_->offset; - const int32_t pos = raw_value_offsets_[i]; + const offset_type pos = raw_value_offsets_[i]; *out_length = raw_value_offsets_[i + 1] - pos; return raw_data_ + pos; } @@ -664,7 +661,7 @@ class ARROW_EXPORT BinaryArray : public FlatArray { util::string_view GetView(int64_t i) const { // Account for base offset i += data_->offset; - const int32_t pos = raw_value_offsets_[i]; + const offset_type pos = raw_value_offsets_[i]; return util::string_view(reinterpret_cast(raw_data_ + pos), raw_value_offsets_[i + 1] - pos); } @@ -681,31 +678,52 @@ class ARROW_EXPORT BinaryArray : public FlatArray { /// Note that this buffer does not account for any slice offset std::shared_ptr value_data() const { return data_->buffers[2]; } - const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } + const offset_type* raw_value_offsets() const { + return raw_value_offsets_ + data_->offset; + } // Neither of these functions will perform boundschecking - int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - int32_t value_length(int64_t i) const { + offset_type value_offset(int64_t i) const { + return raw_value_offsets_[i + data_->offset]; + } + offset_type value_length(int64_t 
i) const { i += data_->offset; return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; } protected: // For subclasses - BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} + BaseBinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} - /// Protected method for constructors - void SetData(const std::shared_ptr& data); + // Protected method for constructors + void SetData(const std::shared_ptr& data) { + auto value_offsets = data->buffers[1]; + auto value_data = data->buffers[2]; + this->Array::SetData(data); + raw_data_ = value_data == NULLPTR ? NULLPTR : value_data->data(); + raw_value_offsets_ = + value_offsets == NULLPTR + ? NULLPTR + : reinterpret_cast(value_offsets->data()); + } - // Constructor to allow sub-classes/builders to substitute their own logical type - BinaryArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& value_offsets, + const offset_type* raw_value_offsets_; + const uint8_t* raw_data_; +}; + +/// Concrete Array class for variable-size binary data +class ARROW_EXPORT BinaryArray : public BaseBinaryArray { + public: + explicit BinaryArray(const std::shared_ptr& data); + + BinaryArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); - const int32_t* raw_value_offsets_; - const uint8_t* raw_data_; + protected: + // For subclasses such as StringArray + BinaryArray() : BaseBinaryArray() {} }; /// Concrete Array class for variable-size string (utf-8) data @@ -721,6 +739,34 @@ class ARROW_EXPORT StringArray : public BinaryArray { int64_t null_count = kUnknownNullCount, int64_t offset = 0); }; +/// Concrete Array class for large variable-size binary data +class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray { + public: + explicit LargeBinaryArray(const std::shared_ptr& data); + + LargeBinaryArray(int64_t length, const std::shared_ptr& value_offsets, + 
const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + protected: + // For subclasses such as LargeStringArray + LargeBinaryArray() : BaseBinaryArray() {} +}; + +/// Concrete Array class for large variable-size string (utf-8) data +class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { + public: + using TypeClass = LargeStringType; + + explicit LargeStringArray(const std::shared_ptr& data); + + LargeStringArray(int64_t length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); +}; + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 818ad155996..b83897d7e19 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -43,173 +43,15 @@ using internal::checked_cast; // ---------------------------------------------------------------------- // String and binary -BinaryBuilder::BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} - -BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} - -Status BinaryBuilder::Resize(int64_t capacity) { - if (capacity > kListMaximumElements) { - return Status::CapacityError( - "BinaryBuilder cannot reserve space for more then 2^31 - 1 child elements, got ", - capacity); - } - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - - // one more then requested for offsets - RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); - return ArrayBuilder::Resize(capacity); -} - -Status BinaryBuilder::ReserveData(int64_t elements) { - const int64_t size = value_data_length() + elements; - ARROW_RETURN_IF( - size > 
kBinaryMemoryLimit, - Status::CapacityError("Cannot reserve capacity larger than 2^31 - 1 for binary")); - - return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements) - : Status::OK(); -} - -Status BinaryBuilder::AppendOverflow(int64_t num_bytes) { - return Status::CapacityError("BinaryArray cannot contain more than ", - kBinaryMemoryLimit, " bytes, have ", num_bytes); -} - -Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { - // Write final offset (values length) - RETURN_NOT_OK(AppendNextOffset()); - - // These buffers' padding zeroed by BufferBuilder - std::shared_ptr offsets, value_data, null_bitmap; - RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); - RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - - *out = - ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data}, null_count_, 0); - Reset(); - return Status::OK(); -} - -void BinaryBuilder::Reset() { - ArrayBuilder::Reset(); - offsets_builder_.Reset(); - value_data_builder_.Reset(); -} - -const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const { - const int32_t* offsets = offsets_builder_.data(); - int32_t offset = offsets[i]; - if (i == (length_ - 1)) { - *out_length = static_cast(value_data_builder_.length()) - offset; - } else { - *out_length = offsets[i + 1] - offset; - } - return value_data_builder_.data() + offset; -} - -util::string_view BinaryBuilder::GetView(int64_t i) const { - const int32_t* offsets = offsets_builder_.data(); - int32_t offset = offsets[i]; - int32_t value_length; - if (i == (length_ - 1)) { - value_length = static_cast(value_data_builder_.length()) - offset; - } else { - value_length = offsets[i + 1] - offset; - } - return util::string_view( - reinterpret_cast(value_data_builder_.data() + offset), value_length); -} +BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BaseBinaryBuilder(binary(), pool) {} StringBuilder::StringBuilder(MemoryPool* pool) : 
BinaryBuilder(utf8(), pool) {} -Status StringBuilder::AppendValues(const std::vector& values, - const uint8_t* valid_bytes) { - std::size_t total_length = std::accumulate( - values.begin(), values.end(), 0ULL, - [](uint64_t sum, const std::string& str) { return sum + str.size(); }); - RETURN_NOT_OK(Reserve(values.size())); - RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); - RETURN_NOT_OK(offsets_builder_.Reserve(values.size())); - - if (valid_bytes) { - for (std::size_t i = 0; i < values.size(); ++i) { - UnsafeAppendNextOffset(); - if (valid_bytes[i]) { - value_data_builder_.UnsafeAppend( - reinterpret_cast(values[i].data()), values[i].size()); - } - } - } else { - for (std::size_t i = 0; i < values.size(); ++i) { - UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend(reinterpret_cast(values[i].data()), - values[i].size()); - } - } +LargeBinaryBuilder::LargeBinaryBuilder(MemoryPool* pool) + : BaseBinaryBuilder(large_binary(), pool) {} - UnsafeAppendToBitmap(valid_bytes, values.size()); - return Status::OK(); -} - -Status StringBuilder::AppendValues(const char** values, int64_t length, - const uint8_t* valid_bytes) { - std::size_t total_length = 0; - std::vector value_lengths(length); - bool have_null_value = false; - for (int64_t i = 0; i < length; ++i) { - if (values[i]) { - auto value_length = strlen(values[i]); - value_lengths[i] = value_length; - total_length += value_length; - } else { - have_null_value = true; - } - } - RETURN_NOT_OK(Reserve(length)); - RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); - RETURN_NOT_OK(offsets_builder_.Reserve(length)); - - if (valid_bytes) { - int64_t valid_bytes_offset = 0; - for (int64_t i = 0; i < length; ++i) { - UnsafeAppendNextOffset(); - if (valid_bytes[i]) { - if (values[i]) { - value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), - value_lengths[i]); - } else { - UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset); - UnsafeAppendToBitmap(false); - 
valid_bytes_offset = i + 1; - } - } - } - UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset); - } else { - if (have_null_value) { - std::vector valid_vector(length, 0); - for (int64_t i = 0; i < length; ++i) { - UnsafeAppendNextOffset(); - if (values[i]) { - value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), - value_lengths[i]); - valid_vector[i] = 1; - } - } - UnsafeAppendToBitmap(valid_vector.data(), length); - } else { - for (int64_t i = 0; i < length; ++i) { - UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), - value_lengths[i]); - } - UnsafeAppendToBitmap(nullptr, length); - } - } - return Status::OK(); -} +LargeStringBuilder::LargeStringBuilder(MemoryPool* pool) + : LargeBinaryBuilder(large_utf8(), pool) {} // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 47d3bae4b89..7ae4d311de5 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -17,8 +17,11 @@ #pragma once +#include +#include #include #include +#include #include #include @@ -37,15 +40,16 @@ constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; // ---------------------------------------------------------------------- // Binary and String -/// \class BinaryBuilder -/// \brief Builder class for variable-length binary data -class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { +template +class BaseBinaryBuilder : public ArrayBuilder { public: - explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + using TypeClass = TYPE; + using offset_type = typename TypeClass::offset_type; - BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + BaseBinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} - Status Append(const 
uint8_t* value, int32_t length) { + Status Append(const uint8_t* value, offset_type length) { ARROW_RETURN_NOT_OK(Reserve(1)); ARROW_RETURN_NOT_OK(AppendNextOffset()); // Safety check for UBSAN. @@ -57,14 +61,22 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { return Status::OK(); } + Status Append(const char* value, offset_type length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(util::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { + if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { return AppendOverflow(num_bytes); } ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); } UnsafeAppendToBitmap(length, false); return Status::OK(); @@ -77,56 +89,182 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { return Status::OK(); } - Status Append(const char* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(util::string_view value) { - return Append(value.data(), static_cast(value.size())); - } - /// \brief Append without checking capacity /// /// Offsets and data should have been presized using Reserve() and /// ReserveData(), respectively. 
- void UnsafeAppend(const uint8_t* value, int32_t length) { + void UnsafeAppend(const uint8_t* value, offset_type length) { UnsafeAppendNextOffset(); value_data_builder_.UnsafeAppend(value, length); UnsafeAppendToBitmap(true); } - void UnsafeAppend(const char* value, int32_t length) { + void UnsafeAppend(const char* value, offset_type length) { UnsafeAppend(reinterpret_cast(value), length); } void UnsafeAppend(const std::string& value) { - UnsafeAppend(value.c_str(), static_cast(value.size())); + UnsafeAppend(value.c_str(), static_cast(value.size())); } void UnsafeAppend(util::string_view value) { - UnsafeAppend(value.data(), static_cast(value.size())); + UnsafeAppend(value.data(), static_cast(value.size())); } void UnsafeAppendNull() { const int64_t num_bytes = value_data_builder_.length(); - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); UnsafeAppendToBitmap(false); } - void Reset() override; - Status Resize(int64_t capacity) override; + /// \brief Append a sequence of strings in one shot. 
+ /// + /// \param[in] values a vector of strings + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const std::vector& values, + const uint8_t* valid_bytes = NULLPTR) { + std::size_t total_length = std::accumulate( + values.begin(), values.end(), 0ULL, + [](uint64_t sum, const std::string& str) { return sum + str.size(); }); + ARROW_RETURN_NOT_OK(Reserve(values.size())); + ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); + ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size())); + + if (valid_bytes != NULLPTR) { + for (std::size_t i = 0; i < values.size(); ++i) { + UnsafeAppendNextOffset(); + if (valid_bytes[i]) { + value_data_builder_.UnsafeAppend( + reinterpret_cast(values[i].data()), values[i].size()); + } + } + } else { + for (std::size_t i = 0; i < values.size(); ++i) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend( + reinterpret_cast(values[i].data()), values[i].size()); + } + } + + UnsafeAppendToBitmap(valid_bytes, values.size()); + return Status::OK(); + } + + /// \brief Append a sequence of nul-terminated strings in one shot. + /// If one of the values is NULL, it is processed as a null + /// value even if the corresponding valid_bytes entry is 1. 
+ /// + /// \param[in] values a contiguous C array of nul-terminated char * + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const char** values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + std::size_t total_length = 0; + std::vector value_lengths(length); + bool have_null_value = false; + for (int64_t i = 0; i < length; ++i) { + if (values[i] != NULLPTR) { + auto value_length = strlen(values[i]); + value_lengths[i] = value_length; + total_length += value_length; + } else { + have_null_value = true; + } + } + ARROW_RETURN_NOT_OK(Reserve(length)); + ARROW_RETURN_NOT_OK(ReserveData(total_length)); + + if (valid_bytes) { + int64_t valid_bytes_offset = 0; + for (int64_t i = 0; i < length; ++i) { + UnsafeAppendNextOffset(); + if (valid_bytes[i]) { + if (values[i]) { + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); + } else { + UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, + i - valid_bytes_offset); + UnsafeAppendToBitmap(false); + valid_bytes_offset = i + 1; + } + } + } + UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset); + } else { + if (have_null_value) { + std::vector valid_vector(length, 0); + for (int64_t i = 0; i < length; ++i) { + UnsafeAppendNextOffset(); + if (values[i]) { + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); + valid_vector[i] = 1; + } + } + UnsafeAppendToBitmap(valid_vector.data(), length); + } else { + for (int64_t i = 0; i < length; ++i) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); + } + UnsafeAppendToBitmap(NULLPTR, length); + } + } + return Status::OK(); + } + + void Reset() override { + ArrayBuilder::Reset(); + offsets_builder_.Reset(); + value_data_builder_.Reset(); + } + + Status 
Resize(int64_t capacity) override { + // XXX Why is this check necessary? There is no reason to disallow, say, + // binary arrays with more than 2**31 empty or null values. + if (capacity > memory_limit()) { + return Status::CapacityError("BinaryBuilder cannot reserve space for more than ", + memory_limit(), " child elements, got ", capacity); + } + ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + + // One more than requested for offsets + ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); + return ArrayBuilder::Resize(capacity); + } /// \brief Ensures there is enough allocated capacity to append the indicated /// number of bytes to the value data buffer without additional allocations - Status ReserveData(int64_t elements); + Status ReserveData(int64_t elements) { + const int64_t size = value_data_length() + elements; + ARROW_RETURN_IF(size > memory_limit(), + Status::CapacityError("Cannot reserve capacity larger than ", + memory_limit(), " bytes")); + return (size > value_data_capacity()) ? 
value_data_builder_.Reserve(elements) + : Status::OK(); + } - Status FinishInternal(std::shared_ptr* out) override; + Status FinishInternal(std::shared_ptr* out) override { + // Write final offset (values length) + ARROW_RETURN_NOT_OK(AppendNextOffset()); - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond + // These buffers' padding zeroed by BufferBuilder + std::shared_ptr offsets, value_data, null_bitmap; + ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); + ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + *out = ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data}, + null_count_, 0); + Reset(); + return Status::OK(); + } /// \return size of values buffer so far int64_t value_data_length() const { return value_data_builder_.length(); } @@ -136,33 +274,70 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { /// Temporary access to a value. /// /// This pointer becomes invalid on the next modifying operation. - const uint8_t* GetValue(int64_t i, int32_t* out_length) const; + const uint8_t* GetValue(int64_t i, offset_type* out_length) const { + const offset_type* offsets = offsets_builder_.data(); + const auto offset = offsets[i]; + if (i == (length_ - 1)) { + *out_length = static_cast(value_data_builder_.length()) - offset; + } else { + *out_length = offsets[i + 1] - offset; + } + return value_data_builder_.data() + offset; + } /// Temporary access to a value. /// /// This view becomes invalid on the next modifying operation. 
- util::string_view GetView(int64_t i) const; + util::string_view GetView(int64_t i) const { + offset_type value_length; + const uint8_t* value_data = GetValue(i, &value_length); + return util::string_view(reinterpret_cast(value_data), value_length); + } protected: - TypedBufferBuilder offsets_builder_; + TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; - Status AppendOverflow(int64_t num_bytes); + Status AppendOverflow(int64_t num_bytes) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", num_bytes); + } Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { + if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { return AppendOverflow(num_bytes); } - return offsets_builder_.Append(static_cast(num_bytes)); + return offsets_builder_.Append(static_cast(num_bytes)); } void UnsafeAppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } + + // Cannot make this a static attribute because of linking issues + static constexpr int64_t memory_limit() { + return std::numeric_limits::max() - 1; } }; +/// \class BinaryBuilder +/// \brief Builder class for variable-length binary data +class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder { + public: + explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + protected: + using BaseBinaryBuilder::BaseBinaryBuilder; +}; + /// \class StringBuilder /// \brief Builder class for UTF8 strings class ARROW_EXPORT StringBuilder : public BinaryBuilder { @@ -170,36 +345,41 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { using BinaryBuilder::BinaryBuilder; explicit 
StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - using BinaryBuilder::Append; - using BinaryBuilder::Reset; - using BinaryBuilder::UnsafeAppend; + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond - /// \brief Append a sequence of strings in one shot. - /// - /// \param[in] values a vector of strings - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const std::vector& values, - const uint8_t* valid_bytes = NULLPTR); + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; - /// \brief Append a sequence of nul-terminated strings in one shot. - /// If one of the values is NULL, it is processed as a null - /// value even if the corresponding valid_bytes entry is 1. - /// - /// \param[in] values a contiguous C array of nul-terminated char * - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const char** values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); +/// \class LargeBinaryBuilder +/// \brief Builder class for large variable-length binary data +class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder { + public: + explicit LargeBinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); /// \cond FALSE using ArrayBuilder::Finish; /// \endcond - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + protected: + using BaseBinaryBuilder::BaseBinaryBuilder; +}; + +/// \class LargeStringBuilder +/// \brief Builder class for large UTF8 strings +class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { + public: + using LargeBinaryBuilder::LargeBinaryBuilder; + explicit LargeStringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + /// \cond FALSE + using 
ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } }; // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array/concatenate-test.cc b/cpp/src/arrow/array/concatenate-test.cc index cf105ceb65b..730b25ab822 100644 --- a/cpp/src/arrow/array/concatenate-test.cc +++ b/cpp/src/arrow/array/concatenate-test.cc @@ -48,10 +48,11 @@ class ConcatenateTest : public ::testing::Test { sizes_({0, 1, 2, 4, 16, 31, 1234}), null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} - std::vector Offsets(int32_t length, int32_t slice_count) { - std::vector offsets(static_cast(slice_count + 1)); + template + std::vector Offsets(int32_t length, int32_t slice_count) { + std::vector offsets(static_cast(slice_count + 1)); std::default_random_engine gen(seed_); - std::uniform_int_distribution dist(0, length); + std::uniform_int_distribution dist(0, length); std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); }); std::sort(offsets.begin(), offsets.end()); return offsets; @@ -85,7 +86,7 @@ class ConcatenateTest : public ::testing::Test { template void Check(ArrayFactory&& factory) { for (auto size : this->sizes_) { - auto offsets = this->Offsets(size, 3); + auto offsets = this->Offsets(size, 3); for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); @@ -146,16 +147,16 @@ TYPED_TEST(PrimitiveConcatenateTest, Primitives) { TEST_F(ConcatenateTest, StringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto char_array = this->GeneratePrimitive(values_size, null_probability); - std::shared_ptr offsets; - auto offsets_vector = this->Offsets(values_size, size); - // ensure the first offset is 0, which is expected for StringType - offsets_vector[0] = 0; - ASSERT_OK(CopyBufferFromVector(offsets_vector, default_memory_pool(), &offsets)); - *out = 
MakeArray(ArrayData::Make( - utf8(), size, - {char_array->data()->buffers[0], offsets, char_array->data()->buffers[1]})); + *out = rng_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); + ASSERT_OK(ValidateArray(**out)); + }); +} + +TEST_F(ConcatenateTest, LargeStringType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + *out = + rng_.LargeString(size, /*min_length =*/0, /*max_length =*/15, null_probability); + ASSERT_OK(ValidateArray(**out)); }); } @@ -163,7 +164,7 @@ TEST_F(ConcatenateTest, ListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { auto values_size = size * 4; auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); + auto offsets_vector = this->Offsets(values_size, size); // ensure the first offset is 0, which is expected for ListType offsets_vector[0] = 0; std::shared_ptr offsets; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 60da0d3f856..a20b157acd5 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -184,14 +184,21 @@ class ConcatenateImpl { Status Visit(const BinaryType&) { std::vector value_ranges; - RETURN_NOT_OK(ConcatenateOffsets(Buffers(1, *offset_type), pool_, + RETURN_NOT_OK(ConcatenateOffsets(Buffers(1, sizeof(int32_t)), pool_, + &out_.buffers[1], &value_ranges)); + return ConcatenateBuffers(Buffers(2, value_ranges), pool_, &out_.buffers[2]); + } + + Status Visit(const LargeBinaryType&) { + std::vector value_ranges; + RETURN_NOT_OK(ConcatenateOffsets(Buffers(1, sizeof(int64_t)), pool_, &out_.buffers[1], &value_ranges)); return ConcatenateBuffers(Buffers(2, value_ranges), pool_, &out_.buffers[2]); } Status Visit(const ListType&) { std::vector value_ranges; - RETURN_NOT_OK(ConcatenateOffsets(Buffers(1, *offset_type), pool_, + RETURN_NOT_OK(ConcatenateOffsets(Buffers(1, sizeof(int32_t)), pool_, 
&out_.buffers[1], &value_ranges)); return ConcatenateImpl(ChildData(0, value_ranges), pool_) .Concatenate(out_.child_data[0].get()); @@ -277,13 +284,11 @@ class ConcatenateImpl { } // Gather the index-th buffer of each input into a vector. - // Buffers are assumed to contain elements of fixed.bit_width(), + // Buffers are assumed to contain elements of the given byte_width, // those elements are sliced with that input's offset and length. // Note that BufferVector will not contain the buffer of in_[i] if it's // nullptr. - BufferVector Buffers(size_t index, const FixedWidthType& fixed) { - DCHECK_EQ(fixed.bit_width() % 8, 0); - auto byte_width = fixed.bit_width() / 8; + BufferVector Buffers(size_t index, int byte_width) { BufferVector buffers; buffers.reserve(in_.size()); for (const ArrayData& array_data : in_) { @@ -296,6 +301,16 @@ class ConcatenateImpl { return buffers; } + // Gather the index-th buffer of each input into a vector. + // Buffers are assumed to contain elements of fixed.bit_width(), + // those elements are sliced with that input's offset and length. + // Note that BufferVector will not contain the buffer of in_[i] if it's + // nullptr. + BufferVector Buffers(size_t index, const FixedWidthType& fixed) { + DCHECK_EQ(fixed.bit_width() % 8, 0); + return Buffers(index, fixed.bit_width() / 8); + } + // Gather the index-th buffer of each input as a Bitmap // into a vector of Bitmaps. 
std::vector Bitmaps(size_t index) { @@ -328,15 +343,11 @@ class ConcatenateImpl { return child_data; } - static const std::shared_ptr offset_type; const std::vector& in_; MemoryPool* pool_; ArrayData out_; }; -const std::shared_ptr ConcatenateImpl::offset_type = - std::static_pointer_cast(int32()); - Status Concatenate(const ArrayVector& arrays, MemoryPool* pool, std::shared_ptr* out) { if (arrays.size() == 0) { diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index cee443c4885..44b0d041be9 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -107,6 +107,8 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(DOUBLE, DoubleBuilder); BUILDER_CASE(STRING, StringBuilder); BUILDER_CASE(BINARY, BinaryBuilder); + BUILDER_CASE(LARGE_STRING, LargeStringBuilder); + BUILDER_CASE(LARGE_BINARY, LargeBinaryBuilder); BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); BUILDER_CASE(DECIMAL, Decimal128Builder); case Type::DICTIONARY: { diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 097bc8f7698..cb606e30b9b 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -144,8 +144,9 @@ class RangeEqualsVisitor { return Status::OK(); } - bool CompareBinaryRange(const BinaryArray& left) const { - const auto& right = checked_cast(right_); + template + bool CompareBinaryRange(const BinaryArrayType& left) const { + const auto& right = checked_cast(right_); for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; ++i, ++o_i) { @@ -154,10 +155,10 @@ class RangeEqualsVisitor { return false; } if (is_null) continue; - const int32_t begin_offset = left.value_offset(i); - const int32_t end_offset = left.value_offset(i + 1); - const int32_t right_begin_offset = right.value_offset(o_i); - const int32_t right_end_offset = right.value_offset(o_i + 1); + const auto begin_offset = left.value_offset(i); + const auto end_offset = left.value_offset(i + 1); + const auto 
right_begin_offset = right.value_offset(o_i); + const auto right_end_offset = right.value_offset(o_i + 1); // Underlying can't be equal if the size isn't equal if (end_offset - begin_offset != right_end_offset - right_begin_offset) { return false; @@ -278,6 +279,11 @@ class RangeEqualsVisitor { return Status::OK(); } + Status Visit(const LargeBinaryArray& left) { + result_ = CompareBinaryRange(left); + return Status::OK(); + } + Status Visit(const FixedSizeBinaryArray& left) { const auto& right = checked_cast(right_); @@ -489,18 +495,21 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { template bool ValueOffsetsEqual(const ArrayType& left) { + using offset_type = typename ArrayType::offset_type; + const auto& right = checked_cast(right_); if (left.offset() == 0 && right.offset() == 0) { return left.value_offsets()->Equals(*right.value_offsets(), - (left.length() + 1) * sizeof(int32_t)); + (left.length() + 1) * sizeof(offset_type)); } else { // One of the arrays is sliced; logic is more complicated because the // value offsets are not both 0-based auto left_offsets = - reinterpret_cast(left.value_offsets()->data()) + left.offset(); + reinterpret_cast(left.value_offsets()->data()) + + left.offset(); auto right_offsets = - reinterpret_cast(right.value_offsets()->data()) + + reinterpret_cast(right.value_offsets()->data()) + right.offset(); for (int64_t i = 0; i < left.length() + 1; ++i) { @@ -512,10 +521,11 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { } } - bool CompareBinary(const BinaryArray& left) { - const auto& right = checked_cast(right_); + template + bool CompareBinary(const BinaryArrayType& left) { + const auto& right = checked_cast(right_); - bool equal_offsets = ValueOffsetsEqual(left); + bool equal_offsets = ValueOffsetsEqual(left); if (!equal_offsets) { return false; } @@ -544,8 +554,8 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { } } else { // ARROW-537: Only compare data in non-null slots - const int32_t* left_offsets = 
left.raw_value_offsets(); - const int32_t* right_offsets = right.raw_value_offsets(); + auto left_offsets = left.raw_value_offsets(); + auto right_offsets = right.raw_value_offsets(); for (int64_t i = 0; i < left.length(); ++i) { if (left.IsNull(i)) { continue; @@ -564,6 +574,11 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor { return Status::OK(); } + Status Visit(const LargeBinaryArray& left) { + result_ = CompareBinary(left); + return Status::OK(); + } + Status Visit(const ListArray& left) { const auto& right = checked_cast(right_); bool equal_offsets = ValueOffsetsEqual(left); @@ -822,6 +837,15 @@ class ScalarEqualsVisitor { return Status::OK(); } + template + typename std::enable_if::value, Status>::type + Visit(const T& left_) { + const auto& left = checked_cast(left_); + const auto& right = checked_cast(right_); + result_ = internal::SharedPtrEquals(left.value, right.value); + return Status::OK(); + } + Status Visit(const Decimal128Scalar& left) { const auto& right = checked_cast(right_); result_ = left.value == right.value; diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc index 6bf4f941755..80538f20e4f 100644 --- a/cpp/src/arrow/compute/kernels/cast-test.cc +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -52,6 +52,8 @@ namespace compute { using internal::checked_cast; +static constexpr const char* kInvalidUtf8 = "\xa0\xa1"; + static std::vector> kNumericTypes = { uint8(), int8(), uint16(), int16(), uint32(), int32(), uint64(), int64(), float32(), float64()}; @@ -131,6 +133,132 @@ class TestCast : public ComputeFixture, public TestBase { CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); } } + + template + void TestCastBinaryToString() { + CastOptions options; + auto src_type = TypeTraits::type_singleton(); + auto dest_type = TypeTraits::type_singleton(); + + // All valid except the last one + std::vector all = {1, 1, 1, 1, 1}; + std::vector valid = {1, 1, 1, 1, 0}; + 
std::vector strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8}; + + std::shared_ptr array; + + // Should accept when invalid but null. + ArrayFromVector(src_type, valid, strings, &array); + CheckZeroCopy(*array, dest_type); + + // Should refuse due to invalid utf8 payload + CheckFails(src_type, strings, all, dest_type, options); + + // Should accept due to option override + options.allow_invalid_utf8 = true; + CheckCase( + src_type, strings, all, dest_type, strings, options); + } + + template + void TestCastStringToNumber() { + CastOptions options; + auto src_type = TypeTraits::type_singleton(); + + std::vector is_valid = {true, false, true, true, true}; + + // string to int + std::vector v_int = {"0", "1", "127", "-1", "0"}; + std::vector e_int8 = {0, 1, 127, -1, 0}; + std::vector e_int16 = {0, 1, 127, -1, 0}; + std::vector e_int32 = {0, 1, 127, -1, 0}; + std::vector e_int64 = {0, 1, 127, -1, 0}; + CheckCase(src_type, v_int, is_valid, + int8(), e_int8, options); + CheckCase(src_type, v_int, is_valid, + int16(), e_int16, options); + CheckCase(src_type, v_int, is_valid, + int32(), e_int32, options); + CheckCase(src_type, v_int, is_valid, + int64(), e_int64, options); + + v_int = {"2147483647", "0", "-2147483648", "0", "0"}; + e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; + CheckCase(src_type, v_int, is_valid, + int32(), e_int32, options); + v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; + e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; + CheckCase(src_type, v_int, is_valid, + int64(), e_int64, options); + + // string to uint + std::vector v_uint = {"0", "1", "127", "255", "0"}; + std::vector e_uint8 = {0, 1, 127, 255, 0}; + std::vector e_uint16 = {0, 1, 127, 255, 0}; + std::vector e_uint32 = {0, 1, 127, 255, 0}; + std::vector e_uint64 = {0, 1, 127, 255, 0}; + CheckCase(src_type, v_uint, is_valid, + uint8(), e_uint8, options); + CheckCase(src_type, v_uint, is_valid, + uint16(), e_uint16, options); + 
CheckCase(src_type, v_uint, is_valid, + uint32(), e_uint32, options); + CheckCase(src_type, v_uint, is_valid, + uint64(), e_uint64, options); + + v_uint = {"4294967295", "0", "0", "0", "0"}; + e_uint32 = {4294967295, 0, 0, 0, 0}; + CheckCase(src_type, v_uint, is_valid, + uint32(), e_uint32, options); + v_uint = {"18446744073709551615", "0", "0", "0", "0"}; + e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; + CheckCase(src_type, v_uint, is_valid, + uint64(), e_uint64, options); + + // string to float + std::vector v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; + std::vector e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; + std::vector e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; + CheckCase(src_type, v_float, is_valid, + float32(), e_float, options); + CheckCase(src_type, v_float, is_valid, + float64(), e_double, options); + + // Test that casting is locale-independent + auto global_locale = std::locale(); + try { + // French locale uses the comma as decimal point + std::locale::global(std::locale("fr_FR.UTF-8")); + } catch (std::runtime_error&) { + // Locale unavailable, ignore + } + CheckCase(src_type, v_float, is_valid, + float32(), e_float, options); + CheckCase(src_type, v_float, is_valid, + float64(), e_double, options); + std::locale::global(global_locale); + } + + template + void TestCastStringToTimestamp() { + CastOptions options; + auto src_type = TypeTraits::type_singleton(); + + std::vector is_valid = {true, false, true}; + std::vector strings = {"1970-01-01", "xxx", "2000-02-29"}; + + auto type = timestamp(TimeUnit::SECOND); + std::vector e = {0, 0, 951782400}; + CheckCase( + src_type, strings, is_valid, type, e, options); + + type = timestamp(TimeUnit::MICRO); + e = {0, 0, 951782400000000LL}; + CheckCase( + src_type, strings, is_valid, type, e, options); + + // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc + } }; TEST_F(TestCast, SameTypeZeroCopy) { @@ -922,6 +1050,10 @@ TEST_F(TestCast, StringToBoolean) { e, options); 
CheckCase(utf8(), v2, is_valid, boolean(), e, options); + + // Same with LargeStringType + CheckCase(large_utf8(), v1, is_valid, + boolean(), e, options); } TEST_F(TestCast, StringToBooleanErrors) { @@ -931,84 +1063,13 @@ TEST_F(TestCast, StringToBooleanErrors) { CheckFails(utf8(), {"false "}, is_valid, boolean(), options); CheckFails(utf8(), {"T"}, is_valid, boolean(), options); + CheckFails(large_utf8(), {"T"}, is_valid, boolean(), + options); } -TEST_F(TestCast, StringToNumber) { - CastOptions options; +TEST_F(TestCast, StringToNumber) { TestCastStringToNumber(); } - std::vector is_valid = {true, false, true, true, true}; - - // string to int - std::vector v_int = {"0", "1", "127", "-1", "0"}; - std::vector e_int8 = {0, 1, 127, -1, 0}; - std::vector e_int16 = {0, 1, 127, -1, 0}; - std::vector e_int32 = {0, 1, 127, -1, 0}; - std::vector e_int64 = {0, 1, 127, -1, 0}; - CheckCase(utf8(), v_int, is_valid, int8(), - e_int8, options); - CheckCase(utf8(), v_int, is_valid, int16(), - e_int16, options); - CheckCase(utf8(), v_int, is_valid, int32(), - e_int32, options); - CheckCase(utf8(), v_int, is_valid, int64(), - e_int64, options); - - v_int = {"2147483647", "0", "-2147483648", "0", "0"}; - e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; - CheckCase(utf8(), v_int, is_valid, int32(), - e_int32, options); - v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; - e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; - CheckCase(utf8(), v_int, is_valid, int64(), - e_int64, options); - - // string to uint - std::vector v_uint = {"0", "1", "127", "255", "0"}; - std::vector e_uint8 = {0, 1, 127, 255, 0}; - std::vector e_uint16 = {0, 1, 127, 255, 0}; - std::vector e_uint32 = {0, 1, 127, 255, 0}; - std::vector e_uint64 = {0, 1, 127, 255, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint8(), e_uint8, options); - CheckCase(utf8(), v_uint, is_valid, - uint16(), e_uint16, options); - CheckCase(utf8(), v_uint, is_valid, - uint32(), e_uint32, 
options); - CheckCase(utf8(), v_uint, is_valid, - uint64(), e_uint64, options); - - v_uint = {"4294967295", "0", "0", "0", "0"}; - e_uint32 = {4294967295, 0, 0, 0, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint32(), e_uint32, options); - v_uint = {"18446744073709551615", "0", "0", "0", "0"}; - e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint64(), e_uint64, options); - - // string to float - std::vector v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; - std::vector e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; - std::vector e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; - CheckCase(utf8(), v_float, is_valid, - float32(), e_float, options); - CheckCase(utf8(), v_float, is_valid, - float64(), e_double, options); - - // Test that casting is locale-independent - auto global_locale = std::locale(); - try { - // French locale uses the comma as decimal point - std::locale::global(std::locale("fr_FR.UTF-8")); - } catch (std::runtime_error&) { - // Locale unavailable, ignore - } - CheckCase(utf8(), v_float, is_valid, - float32(), e_float, options); - CheckCase(utf8(), v_float, is_valid, - float64(), e_double, options); - std::locale::global(global_locale); -} +TEST_F(TestCast, LargeStringToNumber) { TestCastStringToNumber(); } TEST_F(TestCast, StringToNumberErrors) { CastOptions options; @@ -1027,24 +1088,9 @@ TEST_F(TestCast, StringToNumberErrors) { CheckFails(utf8(), {"z"}, is_valid, float32(), options); } -TEST_F(TestCast, StringToTimestamp) { - CastOptions options; - - std::vector is_valid = {true, false, true}; - std::vector strings = {"1970-01-01", "xxx", "2000-02-29"}; +TEST_F(TestCast, StringToTimestamp) { TestCastStringToTimestamp(); } - auto type = timestamp(TimeUnit::SECOND); - std::vector e = {0, 0, 951782400}; - CheckCase(utf8(), strings, is_valid, - type, e, options); - - type = timestamp(TimeUnit::MICRO); - e = {0, 0, 951782400000000LL}; - CheckCase(utf8(), strings, is_valid, - type, e, options); - - // NOTE: 
timestamp parsing is tested comprehensively in parsing-util-test.cc -} +TEST_F(TestCast, LargeStringToTimestamp) { TestCastStringToTimestamp(); } TEST_F(TestCast, StringToTimestampErrors) { CastOptions options; @@ -1058,29 +1104,10 @@ TEST_F(TestCast, StringToTimestampErrors) { } } -constexpr const char* kInvalidUtf8 = "\xa0\xa1"; - -TEST_F(TestCast, BinaryToString) { - CastOptions options; - - // All valid except the last one - std::vector all = {1, 1, 1, 1, 1}; - std::vector valid = {1, 1, 1, 1, 0}; - std::vector strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8}; - - std::shared_ptr array; - - // Should accept when invalid but null. - ArrayFromVector(binary(), valid, strings, &array); - CheckZeroCopy(*array, utf8()); - - // Should refuse due to invalid utf8 payload - CheckFails(binary(), strings, all, utf8(), options); +TEST_F(TestCast, BinaryToString) { TestCastBinaryToString(); } - // Should accept due to option override - options.allow_invalid_utf8 = true; - CheckCase(binary(), strings, all, - utf8(), strings, options); +TEST_F(TestCast, LargeBinaryToLargeString) { + TestCastBinaryToString(); } TEST_F(TestCast, ListToList) { diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 88a4f30873f..a8b66159997 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -905,13 +905,15 @@ struct CastFunctor { // ---------------------------------------------------------------------- // String to Number -template -struct CastFunctor> { +template +struct CastFunctor::value && + is_number_type::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { using out_type = typename O::c_type; - StringArray input_array(input.Copy()); + typename TypeTraits::ArrayType input_array(input.Copy()); auto out_data = output->GetMutableValues(1); internal::StringConverter converter; @@ -933,15 +935,15 @@ struct CastFunctor> { // 
---------------------------------------------------------------------- // String to Boolean -template -struct CastFunctor::value>::type> { +template +struct CastFunctor::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { - StringArray input_array(input.Copy()); + typename TypeTraits::ArrayType input_array(input.Copy()); internal::FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(), output->offset, input.length); - internal::StringConverter converter; + internal::StringConverter converter; for (int64_t i = 0; i < input.length; ++i) { if (input_array.IsNull(i)) { @@ -972,13 +974,14 @@ struct CastFunctor -struct CastFunctor { +template +struct CastFunctor::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { using out_type = TimestampType::c_type; - StringArray input_array(input.Copy()); + typename TypeTraits::ArrayType input_array(input.Copy()); auto out_data = output->GetMutableValues(1); internal::StringConverter converter(output->type); @@ -1001,47 +1004,51 @@ struct CastFunctor { // Binary to String // -template -struct CastFunctor::value>::type> { +#if defined(_MSC_VER) +// Silence warning: """'visitor': unreferenced local variable""" +#pragma warning(push) +#pragma warning(disable : 4101) +#endif + +template +struct BinaryToStringSameWidthCastFunctor { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { - BinaryArray binary(input.Copy()); + if (!options.allow_invalid_utf8) { + util::InitializeUTF8(); - if (options.allow_invalid_utf8) { - ZeroCopyData(input, output); - return; + ArrayDataVisitor visitor; + Status st = visitor.Visit(input, this); + if (!st.ok()) { + ctx->SetStatus(st); + return; + } } + ZeroCopyData(input, output); + } - util::InitializeUTF8(); - - if (binary.null_count() != 0) { - for (int64_t i = 0; i < input.length; i++) { - 
if (binary.IsNull(i)) { - continue; - } - - const auto str = binary.GetView(i); - if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { - ctx->SetStatus(Status::Invalid("Invalid UTF8 payload")); - return; - } - } + Status VisitNull() { return Status::OK(); } - } else { - for (int64_t i = 0; i < input.length; i++) { - const auto str = binary.GetView(i); - if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { - ctx->SetStatus(Status::Invalid("Invalid UTF8 payload")); - return; - } - } + Status VisitValue(util::string_view str) { + if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { + return Status::Invalid("Invalid UTF8 payload"); } - - ZeroCopyData(input, output); + return Status::OK(); } }; +template <> +struct CastFunctor + : public BinaryToStringSameWidthCastFunctor {}; + +template <> +struct CastFunctor + : public BinaryToStringSameWidthCastFunctor {}; + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + // ---------------------------------------------------------------------- typedef std::function out_ty CAST_FUNCTION_CASE(TimestampType); CAST_FUNCTION_CASE(BinaryType); CAST_FUNCTION_CASE(StringType); + CAST_FUNCTION_CASE(LargeBinaryType); + CAST_FUNCTION_CASE(LargeStringType); CAST_FUNCTION_CASE(DictionaryType); case Type::LIST: RETURN_NOT_OK(GetListCastFunc(in_type, std::move(out_type), options, kernel)); diff --git a/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h b/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h index 77334af36b5..fb82067bb02 100644 --- a/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h +++ b/cpp/src/arrow/compute/kernels/generated/cast-codegen-internal.h @@ -171,6 +171,9 @@ #define BINARY_CASES(TEMPLATE) \ TEMPLATE(BinaryType, StringType) +#define LARGEBINARY_CASES(TEMPLATE) \ + TEMPLATE(LargeBinaryType, LargeStringType) + #define STRING_CASES(TEMPLATE) \ TEMPLATE(StringType, BooleanType) \ TEMPLATE(StringType, UInt8Type) \ @@ -185,6 +188,20 @@ TEMPLATE(StringType, 
DoubleType) \ TEMPLATE(StringType, TimestampType) +#define LARGESTRING_CASES(TEMPLATE) \ + TEMPLATE(LargeStringType, BooleanType) \ + TEMPLATE(LargeStringType, UInt8Type) \ + TEMPLATE(LargeStringType, Int8Type) \ + TEMPLATE(LargeStringType, UInt16Type) \ + TEMPLATE(LargeStringType, Int16Type) \ + TEMPLATE(LargeStringType, UInt32Type) \ + TEMPLATE(LargeStringType, Int32Type) \ + TEMPLATE(LargeStringType, UInt64Type) \ + TEMPLATE(LargeStringType, Int64Type) \ + TEMPLATE(LargeStringType, FloatType) \ + TEMPLATE(LargeStringType, DoubleType) \ + TEMPLATE(LargeStringType, TimestampType) + #define DICTIONARY_CASES(TEMPLATE) \ TEMPLATE(DictionaryType, UInt8Type) \ TEMPLATE(DictionaryType, Int8Type) \ diff --git a/cpp/src/arrow/compute/kernels/generated/codegen.py b/cpp/src/arrow/compute/kernels/generated/codegen.py index 04fc38618bd..c9db7eaa0dc 100644 --- a/cpp/src/arrow/compute/kernels/generated/codegen.py +++ b/cpp/src/arrow/compute/kernels/generated/codegen.py @@ -85,7 +85,9 @@ def generate(self): CastCodeGenerator('Timestamp', ['Date32', 'Date64', 'Timestamp'], parametric=True), CastCodeGenerator('Binary', ['String']), + CastCodeGenerator('LargeBinary', ['LargeString']), CastCodeGenerator('String', NUMERIC_TYPES + ['Timestamp']), + CastCodeGenerator('LargeString', NUMERIC_TYPES + ['Timestamp']), CastCodeGenerator('Dictionary', INTEGER_TYPES + FLOATING_TYPES + DATE_TIME_TYPES + ['Null', 'Binary', 'FixedSizeBinary', 'String', diff --git a/cpp/src/arrow/csv/converter-test.cc b/cpp/src/arrow/csv/converter-test.cc index a5e4c0372c4..53176ff0a1a 100644 --- a/cpp/src/arrow/csv/converter-test.cc +++ b/cpp/src/arrow/csv/converter-test.cc @@ -30,6 +30,7 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" @@ -118,11 +119,17 @@ void AssertConversionError(const std::shared_ptr& type, 
////////////////////////////////////////////////////////////////////////// // Test functions begin here -TEST(BinaryConversion, Basics) { - AssertConversion(binary(), {"ab,cdé\n", ",\xffgh\n"}, - {{"ab", ""}, {"cdé", "\xffgh"}}); +template +static void TestBinaryConversionBasics() { + auto type = TypeTraits::type_singleton(); + AssertConversion(type, {"ab,cdé\n", ",\xffgh\n"}, + {{"ab", ""}, {"cdé", "\xffgh"}}); } +TEST(BinaryConversion, Basics) { TestBinaryConversionBasics(); } + +TEST(LargeBinaryConversion, Basics) { TestBinaryConversionBasics(); } + TEST(BinaryConversion, Nulls) { AssertConversion(binary(), {"ab,N/A\n", "NULL,\n"}, {{"ab", "NULL"}, {"N/A", ""}}, @@ -135,16 +142,22 @@ TEST(BinaryConversion, Nulls) { {{true, false}, {false, false}}, options); } -TEST(StringConversion, Basics) { - AssertConversion(utf8(), {"ab,cdé\n", ",gh\n"}, - {{"ab", ""}, {"cdé", "gh"}}); +template +static void TestStringConversionBasics() { + auto type = TypeTraits::type_singleton(); + AssertConversion(type, {"ab,cdé\n", ",gh\n"}, + {{"ab", ""}, {"cdé", "gh"}}); auto options = ConvertOptions::Defaults(); options.check_utf8 = false; - AssertConversion(utf8(), {"ab,cdé\n", ",\xffgh\n"}, - {{"ab", ""}, {"cdé", "\xffgh"}}, options); + AssertConversion(type, {"ab,cdé\n", ",\xffgh\n"}, + {{"ab", ""}, {"cdé", "\xffgh"}}, options); } +TEST(StringConversion, Basics) { TestStringConversionBasics(); } + +TEST(LargeStringConversion, Basics) { TestStringConversionBasics(); } + TEST(StringConversion, Nulls) { AssertConversion(utf8(), {"ab,N/A\n", "NULL,\n"}, {{"ab", "NULL"}, {"N/A", ""}}, @@ -157,11 +170,17 @@ TEST(StringConversion, Nulls) { {{true, false}, {false, false}}, options); } -TEST(StringConversion, Errors) { +template +static void TestStringConversionErrors() { + auto type = TypeTraits::type_singleton(); // Invalid UTF8 in column 0 - AssertConversionError(utf8(), {"ab,cdé\n", "\xff,gh\n"}, {0}); + AssertConversionError(type, {"ab,cdé\n", "\xff,gh\n"}, {0}); } 
+TEST(StringConversion, Errors) { TestStringConversionErrors(); } + +TEST(LargeStringConversion, Errors) { TestStringConversionErrors(); } + TEST(FixedSizeBinaryConversion, Basics) { AssertConversion( fixed_size_binary(2), {"ab,cd\n", "gh,ij\n"}, {{"ab", "gh"}, {"cd", "ij"}}); diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 53495cf9bc3..1c61d3ccbc9 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -431,6 +431,7 @@ Status Converter::Make(const std::shared_ptr& type, CONVERTER_CASE(Type::BOOL, BooleanConverter) CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter) CONVERTER_CASE(Type::BINARY, (VarSizeBinaryConverter)) + CONVERTER_CASE(Type::LARGE_BINARY, (VarSizeBinaryConverter)) CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) CONVERTER_CASE(Type::DECIMAL, DecimalConverter) @@ -442,6 +443,14 @@ Status Converter::Make(const std::shared_ptr& type, } break; + case Type::LARGE_STRING: + if (options.check_utf8) { + result = new VarSizeBinaryConverter(type, options, pool); + } else { + result = new VarSizeBinaryConverter(type, options, pool); + } + break; + default: { return Status::NotImplemented("CSV conversion to ", type->ToString(), " is not supported"); diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index 7cd64c8d78a..8436bd205b5 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -367,6 +367,8 @@ class TableReader::TableReaderImpl { PRIMITIVE_CASE(DOUBLE, float64); PRIMITIVE_CASE(UTF8, utf8); PRIMITIVE_CASE(BINARY, binary); + PRIMITIVE_CASE(LARGE_UTF8, large_utf8); + PRIMITIVE_CASE(LARGE_BINARY, large_binary); default: return Status::Invalid("Unrecognized type"); } @@ -410,6 +412,10 @@ class TableReader::TableReaderImpl { int64_t offsets_size = GetOutputLength((meta->length() + 1) * sizeof(int32_t)); buffers.push_back(SliceBuffer(buffer, offset, offsets_size)); offset += offsets_size; + } else if (is_large_binary_like(type->id())) 
{ + int64_t offsets_size = GetOutputLength((meta->length() + 1) * sizeof(int64_t)); + buffers.push_back(SliceBuffer(buffer, offset, offsets_size)); + offset += offsets_size; } buffers.push_back(SliceBuffer(buffer, offset, buffer->size() - offset)); @@ -585,6 +591,10 @@ fbs::Type ToFlatbufferType(Type::type type) { return fbs::Type_UTF8; case Type::BINARY: return fbs::Type_BINARY; + case Type::LARGE_STRING: + return fbs::Type_LARGE_UTF8; + case Type::LARGE_BINARY: + return fbs::Type_LARGE_BINARY; case Type::DATE32: return fbs::Type_INT32; case Type::TIMESTAMP: @@ -644,7 +654,8 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { } Status LoadArrayMetadata(const Array& values, ArrayMetadata* meta) { - if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()))) { + if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()) || + is_large_binary_like(values.type_id()))) { return Status::Invalid("Array is not primitive type: ", values.type()->ToString()); } @@ -659,6 +670,32 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { return Status::OK(); } + template + Status WriteBinaryArray(const ArrayType& values, ArrayMetadata* meta, + const uint8_t** values_buffer, int64_t* values_bytes, + int64_t* bytes_written) { + using offset_type = typename ArrayType::offset_type; + + int64_t offset_bytes = sizeof(offset_type) * (values.length() + 1); + + if (values.value_offsets()) { + *values_bytes = values.raw_value_offsets()[values.length()]; + + // Write the variable-length offsets + RETURN_NOT_OK(WritePadded( + stream_.get(), reinterpret_cast(values.raw_value_offsets()), + offset_bytes, bytes_written)); + } else { + RETURN_NOT_OK(WritePaddedBlank(stream_.get(), offset_bytes, bytes_written)); + } + meta->total_bytes += *bytes_written; + + if (values.value_data()) { + *values_buffer = values.value_data()->data(); + } + return Status::OK(); + } + Status WriteArray(const Array& values, ArrayMetadata* meta) { 
RETURN_NOT_OK(CheckStarted()); RETURN_NOT_OK(LoadArrayMetadata(values, meta)); @@ -687,26 +724,11 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { const uint8_t* values_buffer = nullptr; if (is_binary_like(values.type_id())) { - const auto& bin_values = checked_cast(values); - - int64_t offset_bytes = sizeof(int32_t) * (values.length() + 1); - - if (bin_values.value_offsets()) { - values_bytes = bin_values.raw_value_offsets()[values.length()]; - - // Write the variable-length offsets - RETURN_NOT_OK( - WritePadded(stream_.get(), - reinterpret_cast(bin_values.raw_value_offsets()), - offset_bytes, &bytes_written)); - } else { - RETURN_NOT_OK(WritePaddedBlank(stream_.get(), offset_bytes, &bytes_written)); - } - meta->total_bytes += bytes_written; - - if (bin_values.value_data()) { - values_buffer = bin_values.value_data()->data(); - } + RETURN_NOT_OK(WriteBinaryArray(checked_cast(values), meta, + &values_buffer, &values_bytes, &bytes_written)); + } else if (is_large_binary_like(values.type_id())) { + RETURN_NOT_OK(WriteBinaryArray(checked_cast(values), meta, + &values_buffer, &values_bytes, &bytes_written)); } else { const auto& prim_values = checked_cast(values); const auto& fw_type = checked_cast(*values.type()); @@ -760,6 +782,8 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { VISIT_PRIMITIVE(DoubleArray) VISIT_PRIMITIVE(BinaryArray) VISIT_PRIMITIVE(StringArray) + VISIT_PRIMITIVE(LargeBinaryArray) + VISIT_PRIMITIVE(LargeStringArray) #undef VISIT_PRIMITIVE diff --git a/cpp/src/arrow/ipc/feather.fbs b/cpp/src/arrow/ipc/feather.fbs index a27d39989c6..5ec06299864 100644 --- a/cpp/src/arrow/ipc/feather.fbs +++ b/cpp/src/arrow/ipc/feather.fbs @@ -48,7 +48,10 @@ enum Type : byte { TIMESTAMP = 14, DATE = 15, - TIME = 16 + TIME = 16, + + LARGE_UTF8 = 17, + LARGE_BINARY = 18 } enum Encoding : byte { diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 135296551c9..49a884e1f88 100644 --- 
a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -312,6 +312,10 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } + Status Visit(const LargeStringType& type) { return WriteVarBytes("large_utf8", type); } + Status Visit(const LargeBinaryType& type) { + return WriteVarBytes("large_binary", type); + } Status Visit(const FixedSizeBinaryType& type) { return WritePrimitive("fixedsizebinary", type); } @@ -430,20 +434,26 @@ class ArrayWriter { } } - // Binary, encode to hexadecimal. UTF8 string write as is + // Binary, encode to hexadecimal. template - typename std::enable_if::value, void>::type + typename std::enable_if::value || + std::is_same::value, + void>::type WriteDataValues(const T& arr) { for (int64_t i = 0; i < arr.length(); ++i) { - int32_t length; - const uint8_t* buf = arr.GetValue(i, &length); + writer_->String(HexEncode(arr.GetView(i))); + } + } - if (std::is_base_of::value) { - // Presumed UTF-8 - writer_->String(reinterpret_cast(buf), length); - } else { - writer_->String(HexEncode(buf, length)); - } + // UTF8 string, write as is + template + typename std::enable_if::value || + std::is_same::value, + void>::type + WriteDataValues(const T& arr) { + for (int64_t i = 0; i < arr.length(); ++i) { + auto view = arr.GetView(i); + writer_->String(view.data(), static_cast(view.size())); } } @@ -558,8 +568,10 @@ class ArrayWriter { } template - typename std::enable_if::value, Status>::type Visit( - const T& array) { + typename std::enable_if::value || + std::is_base_of::value, + Status>::type + Visit(const T& array) { WriteValidityField(array); WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length() + 1); WriteDataField(array); @@ -911,6 +923,10 @@ static Status GetType(const RjObject& json_type, *type = utf8(); } 
else if (type_name == "binary") { *type = binary(); + } else if (type_name == "large_utf8") { + *type = large_utf8(); + } else if (type_name == "large_binary") { + *type = large_binary(); } else if (type_name == "fixedsizebinary") { return GetFixedSizeBinary(json_type, type); } else if (type_name == "decimal") { @@ -1091,9 +1107,10 @@ class ArrayReader { } template - typename std::enable_if::value, Status>::type Visit( + typename std::enable_if::value, Status>::type Visit( const T& type) { typename TypeTraits::BuilderType builder(pool_); + using offset_type = typename T::offset_type; const auto& json_data = obj_.FindMember(kData); RETURN_NOT_ARRAY(kData, json_data, obj_); @@ -1110,23 +1127,27 @@ class ArrayReader { const rj::Value& val = json_data_arr[i]; DCHECK(val.IsString()); - if (std::is_base_of::value) { + + if (T::is_utf8) { RETURN_NOT_OK(builder.Append(val.GetString())); } else { std::string hex_string = val.GetString(); - DCHECK(hex_string.size() % 2 == 0) << "Expected base16 hex string"; - int32_t length = static_cast(hex_string.size()) / 2; + if (hex_string.size() % 2 != 0) { + return Status::Invalid("Expected base16 hex string"); + } + const auto value_len = static_cast(hex_string.size()) / 2; std::shared_ptr byte_buffer; - RETURN_NOT_OK(AllocateBuffer(pool_, length, &byte_buffer)); + RETURN_NOT_OK(AllocateBuffer(pool_, value_len, &byte_buffer)); const char* hex_data = hex_string.c_str(); uint8_t* byte_buffer_data = byte_buffer->mutable_data(); - for (int32_t j = 0; j < length; ++j) { + for (int64_t j = 0; j < value_len; ++j) { RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j])); } - RETURN_NOT_OK(builder.Append(byte_buffer_data, length)); + RETURN_NOT_OK( + builder.Append(byte_buffer_data, static_cast(value_len))); } } diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc index 3c4277512d3..77ab7702109 100644 --- a/cpp/src/arrow/ipc/json-simple-test.cc +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ 
-322,6 +322,21 @@ TEST(TestString, Basics) { AssertJSONArray(type, "[\"\\u0000\\u001f\"]", {s}); } +TEST(TestLargeString, Basics) { + // Similar as TestString above, only testing the basics + std::shared_ptr type = large_utf8(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[\"\", \"foo\"]", {"", "foo"}); + AssertJSONArray(type, "[\"\", null]", {true, false}, + {"", ""}); + + // Large binary type + type = large_binary(); + AssertJSONArray(type, "[\"\", \"foo\", null]", + {true, true, false}, {"", "foo", ""}); +} + TEST(TestTimestamp, Basics) { // Timestamp type auto type = timestamp(TimeUnit::SECOND); diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index ac0237fc797..20ac025665a 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -26,6 +26,7 @@ #include "arrow/ipc/json-internal.h" #include "arrow/ipc/json-simple.h" #include "arrow/memory_pool.h" +#include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" @@ -344,11 +345,14 @@ class TimestampConverter final : public ConcreteConverter { // ------------------------------------------------------------------------ // Converter for binary and string arrays -class StringConverter final : public ConcreteConverter { +template +class StringConverter final : public ConcreteConverter> { public: + using BuilderType = typename TypeTraits::BuilderType; + explicit StringConverter(const std::shared_ptr& type) { this->type_ = type; - builder_ = std::make_shared(type, default_memory_pool()); + builder_ = std::make_shared(type, default_memory_pool()); } Status AppendNull() override { return builder_->AppendNull(); } @@ -368,7 +372,7 @@ class StringConverter final : public ConcreteConverter { std::shared_ptr builder() override { return builder_; } private: - std::shared_ptr builder_; + std::shared_ptr builder_; }; // 
------------------------------------------------------------------------ @@ -734,8 +738,10 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter) SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) - SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter) - SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index e505ddeca9e..93f859a0a07 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -232,6 +232,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type_Binary: *out = binary(); return Status::OK(); + case flatbuf::Type_LargeBinary: + *out = large_binary(); + return Status::OK(); case flatbuf::Type_FixedSizeBinary: { auto fw_binary = static_cast(type_data); *out = fixed_size_binary(fw_binary->byteWidth()); @@ -240,6 +243,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type_Utf8: *out = utf8(); return Status::OK(); + case flatbuf::Type_LargeUtf8: + *out = large_utf8(); + return Status::OK(); case flatbuf::Type_Bool: *out = boolean(); return Status::OK(); @@ -541,12 +547,24 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const LargeBinaryType& type) { + fb_type_ = flatbuf::Type_LargeBinary; + type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union(); + return Status::OK(); + } + Status Visit(const 
StringType& type) { fb_type_ = flatbuf::Type_Utf8; type_offset_ = flatbuf::CreateUtf8(fbb_).Union(); return Status::OK(); } + Status Visit(const LargeStringType& type) { + fb_type_ = flatbuf::Type_LargeUtf8; + type_offset_ = flatbuf::CreateLargeUtf8(fbb_).Union(); + return Status::OK(); + } + Status Visit(const Date32Type& type) { fb_type_ = flatbuf::Type_Date; type_offset_ = flatbuf::CreateDate(fbb_, flatbuf::DateUnit_DAY).Union(); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index c39f2d7147b..b9f29d747b4 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -249,8 +249,10 @@ class ArrayLoader { } template - typename std::enable_if::value, Status>::type Visit( - const T& type) { + typename std::enable_if::value || + std::is_base_of::value, + Status>::type + Visit(const T& type) { return LoadBinary(); } diff --git a/cpp/src/arrow/ipc/test-common.cc b/cpp/src/arrow/ipc/test-common.cc index 47c307659f0..4cf13ecc059 100644 --- a/cpp/src/arrow/ipc/test-common.cc +++ b/cpp/src/arrow/ipc/test-common.cc @@ -34,6 +34,7 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" namespace arrow { @@ -205,18 +206,16 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo return builder.Finish(out); } -template +template static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { - Builder builder(pool); + BuilderType builder(pool); for (int64_t i = 0; i < length; ++i) { if (include_nulls && (i % 7 == 0)) { RETURN_NOT_OK(builder.AppendNull()); } else { - const std::string value = std::to_string(i); - RETURN_NOT_OK(builder.Append(reinterpret_cast(value.data()), - static_cast(value.size()))); + RETURN_NOT_OK(builder.Append(std::to_string(i))); } } return builder.Finish(out); @@ -224,28 +223,37 @@ static Status 
MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls) { const int64_t length = 500; - auto string_type = utf8(); - auto binary_type = binary(); - auto f0 = field("f0", string_type); - auto f1 = field("f1", binary_type); - auto schema = ::arrow::schema({f0, f1}); + auto f0 = field("strings", utf8()); + auto f1 = field("binaries", binary()); + auto f2 = field("large_strings", large_utf8()); + auto f3 = field("large_binaries", large_binary()); + auto schema = ::arrow::schema({f0, f1, f2, f3}); - std::shared_ptr a0, a1; + std::shared_ptr a0, a1, a2, a3; MemoryPool* pool = default_memory_pool(); // Quirk with RETURN_NOT_OK macro and templated functions { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, - pool, &a0); + auto s = + MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a0); RETURN_NOT_OK(s); } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, - pool, &a1); + auto s = + MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a1); RETURN_NOT_OK(s); } - *out = RecordBatch::Make(schema, length, {a0, a1}); + { + auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, + &a2); + RETURN_NOT_OK(s); + } + { + auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, + &a3); + RETURN_NOT_OK(s); + } + *out = RecordBatch::Make(schema, length, {a0, a1, a2, a3}); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index e1c2ecacba4..ec372074d8b 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -225,7 +225,8 @@ class RecordBatchSerializer : public ArrayVisitor { template Status GetZeroBasedValueOffsets(const ArrayType& array, std::shared_ptr* value_offsets) { - // Share slicing logic between ListArray and BinaryArray + // Share slicing logic between ListArray, BinaryArray and LargeBinaryArray + using offset_type = typename ArrayType::offset_type; auto offsets = 
array.value_offsets(); @@ -235,11 +236,12 @@ class RecordBatchSerializer : public ArrayVisitor { // b) slice the values array accordingly std::shared_ptr shifted_offsets; - RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(int32_t) * (array.length() + 1), + RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(offset_type) * (array.length() + 1), &shifted_offsets)); - int32_t* dest_offsets = reinterpret_cast(shifted_offsets->mutable_data()); - const int32_t start_offset = array.value_offset(0); + offset_type* dest_offsets = + reinterpret_cast(shifted_offsets->mutable_data()); + const offset_type start_offset = array.value_offset(0); for (int i = 0; i < array.length(); ++i) { dest_offsets[i] = array.value_offset(i) - start_offset; @@ -253,9 +255,10 @@ class RecordBatchSerializer : public ArrayVisitor { return Status::OK(); } - Status VisitBinary(const BinaryArray& array) { + template + Status VisitBinary(const ArrayType& array) { std::shared_ptr value_offsets; - RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); + RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); auto data = array.value_data(); int64_t total_data_bytes = 0; @@ -343,6 +346,10 @@ class RecordBatchSerializer : public ArrayVisitor { Status Visit(const BinaryArray& array) override { return VisitBinary(array); } + Status Visit(const LargeStringArray& array) override { return VisitBinary(array); } + + Status Visit(const LargeBinaryArray& array) override { return VisitBinary(array); } + Status Visit(const ListArray& array) override { return VisitList(array); } Status Visit(const MapArray& array) override { return VisitList(array); } diff --git a/cpp/src/arrow/json/converter-test.cc b/cpp/src/arrow/json/converter-test.cc index 86e8e8dc84a..cf09e617dec 100644 --- a/cpp/src/arrow/json/converter-test.cc +++ b/cpp/src/arrow/json/converter-test.cc @@ -85,6 +85,11 @@ TEST(ConverterTest, String) { AssertConvert(utf8(), src, src); } +TEST(ConverterTest, LargeString) { + std::string src = R"(["a", "b c", 
null, "d e f", "g"])"; + AssertConvert(large_utf8(), src, src); +} + TEST(ConverterTest, Timestamp) { std::string src = R"([null, "1970-01-01", "2018-11-13 17:11:10"])"; AssertConvert(timestamp(TimeUnit::SECOND), src, src); diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc index 078e3141869..6b7b730865b 100644 --- a/cpp/src/arrow/json/converter.cc +++ b/cpp/src/arrow/json/converter.cc @@ -264,6 +264,8 @@ Status MakeConverter(const std::shared_ptr& out_type, MemoryPool* pool CONVERTER_CASE(Type::DATE64, DateTimeConverter); CONVERTER_CASE(Type::BINARY, BinaryConverter); CONVERTER_CASE(Type::STRING, BinaryConverter); + CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter); + CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter); default: return Status::NotImplemented("JSON conversion to ", *out_type, " is not supported"); diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index c77a92b7fa6..cdb230c6c3e 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -155,6 +155,7 @@ TEST_F(TestPrettyPrint, PrimitiveType) { null ])expected"; CheckPrimitive({0, 10}, is_valid, values3, ex3); + CheckPrimitive({0, 10}, is_valid, values3, ex3); static const char* ex3_in2 = R"expected( [ "foo", "bar", @@ -163,6 +164,7 @@ TEST_F(TestPrettyPrint, PrimitiveType) { null ])expected"; CheckPrimitive({2, 10}, is_valid, values3, ex3_in2); + CheckPrimitive({2, 10}, is_valid, values3, ex3_in2); } TEST_F(TestPrettyPrint, Int8) { @@ -338,9 +340,11 @@ TEST_F(TestPrettyPrint, BinaryType) { std::vector values = {"foo", "bar", "", "baz", "", "\xff"}; static const char* ex = "[\n 666F6F,\n 626172,\n null,\n 62617A,\n ,\n FF\n]"; CheckPrimitive({0}, is_valid, values, ex); + CheckPrimitive({0}, is_valid, values, ex); static const char* ex_in2 = " [\n 666F6F,\n 626172,\n null,\n 62617A,\n ,\n FF\n ]"; CheckPrimitive({2}, is_valid, values, ex_in2); + CheckPrimitive({2}, is_valid, values, ex_in2); } 
TEST_F(TestPrettyPrint, ListType) { diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 6caef1714bf..5a54e13b889 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -205,7 +205,9 @@ class ArrayPrinter : public PrettyPrinter { // String (Utf8) template - inline typename std::enable_if::value, Status>::type + inline typename std::enable_if::value || + std::is_same::value, + Status>::type WriteDataValues(const T& array) { WriteValues(array, [&](int64_t i) { (*sink_) << "\"" << array.GetView(i) << "\""; }); return Status::OK(); @@ -213,7 +215,9 @@ class ArrayPrinter : public PrettyPrinter { // Binary template - inline typename std::enable_if::value, Status>::type + inline typename std::enable_if::value || + std::is_same::value, + Status>::type WriteDataValues(const T& array) { WriteValues(array, [&](int64_t i) { (*sink_) << HexEncode(array.GetView(i)); }); return Status::OK(); @@ -314,6 +318,7 @@ class ArrayPrinter : public PrettyPrinter { typename std::enable_if::value || std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || std::is_base_of::value, diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 4f0589a2f5c..76aecd01fd3 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -91,20 +91,22 @@ struct NumericScalar : public internal::PrimitiveScalar { : internal::PrimitiveScalar{type, is_valid}, value(value) {} }; -struct ARROW_EXPORT BinaryScalar : public Scalar { +template +struct BaseBinaryScalar : public Scalar { std::shared_ptr value; - explicit BinaryScalar(const std::shared_ptr& value, bool is_valid = true) - : BinaryScalar(value, binary(), is_valid) {} protected: - BinaryScalar(const std::shared_ptr& value, - const std::shared_ptr& type, bool is_valid = true) + BaseBinaryScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool is_valid = true) : Scalar{type, is_valid}, value(value) 
{} }; -struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar { - FixedSizeBinaryScalar(const std::shared_ptr& value, - const std::shared_ptr& type, bool is_valid = true); +struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { + explicit BinaryScalar(const std::shared_ptr& value, bool is_valid = true) + : BaseBinaryScalar(value, binary(), is_valid) {} + + protected: + using BaseBinaryScalar::BaseBinaryScalar; }; struct ARROW_EXPORT StringScalar : public BinaryScalar { @@ -112,6 +114,24 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { : BinaryScalar(value, utf8(), is_valid) {} }; +struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { + explicit LargeBinaryScalar(const std::shared_ptr& value, bool is_valid = true) + : BaseBinaryScalar(value, large_binary(), is_valid) {} + + protected: + using BaseBinaryScalar::BaseBinaryScalar; +}; + +struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { + explicit LargeStringScalar(const std::shared_ptr& value, bool is_valid = true) + : LargeBinaryScalar(value, large_utf8(), is_valid) {} +}; + +struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar { + FixedSizeBinaryScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool is_valid = true); +}; + class ARROW_EXPORT Date32Scalar : public NumericScalar { public: using NumericScalar::NumericScalar; diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 3346e631ab5..a6f03a39eb4 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -145,24 +145,30 @@ PRIMITIVE_RAND_FLOAT_IMPL(Float64, double, DoubleType) #undef PRIMITIVE_RAND_FLOAT_IMPL #undef PRIMITIVE_RAND_IMPL -std::shared_ptr RandomArrayGenerator::String(int64_t size, - int32_t min_length, - int32_t max_length, - double null_probability) { +template +static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, + int64_t size, int32_t min_length, + int32_t max_length, + double null_probability)
{ + using offset_type = typename TypeClass::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + using OffsetArrayType = typename TypeTraits::ArrayType; + if (null_probability < 0 || null_probability > 1) { ABORT_NOT_OK(Status::Invalid("null_probability must be between 0 and 1")); } - auto int32_lengths = Int32(size, min_length, max_length, null_probability); - auto lengths = std::dynamic_pointer_cast(int32_lengths); + auto lengths = std::dynamic_pointer_cast( + gen->Numeric(size, min_length, max_length, null_probability)); // Visual Studio does not implement uniform_int_distribution for char types. using GenOpt = GenerateOptions>; - GenOpt options(seed(), static_cast('A'), static_cast('z'), + GenOpt options(gen->seed(), static_cast('A'), static_cast('z'), /*null_probability=*/0); std::vector str_buffer(max_length); - StringBuilder builder; + BuilderType builder; for (int64_t i = 0; i < size; ++i) { if (lengths->IsValid(i)) { @@ -178,6 +184,22 @@ std::shared_ptr RandomArrayGenerator::String(int64_t size, return result; } +std::shared_ptr RandomArrayGenerator::String(int64_t size, + int32_t min_length, + int32_t max_length, + double null_probability) { + return GenerateBinaryArray(this, size, min_length, max_length, + null_probability); +} + +std::shared_ptr RandomArrayGenerator::LargeString(int64_t size, + int32_t min_length, + int32_t max_length, + double null_probability) { + return GenerateBinaryArray(this, size, min_length, max_length, + null_probability); +} + std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( int64_t size, int64_t unique, int32_t min_length, int32_t max_length, double null_probability) { diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 3126a6901d0..75f6bdf4d6c 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -214,6 +214,19 @@ class ARROW_EXPORT RandomArrayGenerator { std::shared_ptr 
String(int64_t size, int32_t min_length, int32_t max_length, double null_probability); + /// \brief Generates a random LargeStringArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr LargeString(int64_t size, int32_t min_length, + int32_t max_length, double null_probability); + /// \brief Generates a random StringArray with repeated values /// /// \param[in] size the size of the array to generate @@ -235,9 +248,9 @@ class ARROW_EXPORT RandomArrayGenerator { int32_t min_length, int32_t max_length, double null_probability); - private: SeedType seed() { return seed_distribution_(seed_rng_); } + private: std::uniform_int_distribution seed_distribution_; std::default_random_engine seed_rng_; }; diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 7ad1d8ad05d..7bfb7200171 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -354,6 +354,20 @@ TEST(TestStringType, ToString) { ASSERT_EQ(str.ToString(), std::string("string")); } +TEST(TestLargeBinaryTypes, ToString) { + BinaryType bt1; + LargeBinaryType t1; + LargeBinaryType e1; + LargeStringType t2; + EXPECT_TRUE(t1.Equals(e1)); + EXPECT_FALSE(t1.Equals(t2)); + EXPECT_FALSE(t1.Equals(bt1)); + ASSERT_EQ(t1.id(), Type::LARGE_BINARY); + ASSERT_EQ(t1.ToString(), std::string("large_binary")); + ASSERT_EQ(t2.id(), Type::LARGE_STRING); + ASSERT_EQ(t2.ToString(), std::string("large_string")); +} + TEST(TestFixedSizeBinaryType, ToString) { auto t = fixed_size_binary(10); ASSERT_EQ(t->id(), Type::FIXED_SIZE_BINARY); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 76be841a662..dc00a796913 100644 --- 
a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -143,8 +143,6 @@ FloatingPointType::Precision DoubleType::precision() const { return FloatingPointType::DOUBLE; } -std::string StringType::ToString() const { return std::string("string"); } - std::string ListType::ToString() const { std::stringstream s; s << "list<" << value_field()->ToString() << ">"; @@ -178,7 +176,13 @@ std::string FixedSizeListType::ToString() const { return s.str(); } -std::string BinaryType::ToString() const { return std::string("binary"); } +std::string BinaryType::ToString() const { return "binary"; } + +std::string LargeBinaryType::ToString() const { return "large_binary"; } + +std::string StringType::ToString() const { return "string"; } + +std::string LargeStringType::ToString() const { return "large_string"; } int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } @@ -671,7 +675,9 @@ TYPE_FACTORY(float16, HalfFloatType) TYPE_FACTORY(float32, FloatType) TYPE_FACTORY(float64, DoubleType) TYPE_FACTORY(utf8, StringType) +TYPE_FACTORY(large_utf8, LargeStringType) TYPE_FACTORY(binary, BinaryType) +TYPE_FACTORY(large_binary, LargeBinaryType) TYPE_FACTORY(date64, Date64Type) TYPE_FACTORY(date32, Date32Type) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index fc235bb2d67..572b888df11 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -143,7 +143,13 @@ struct Type { /// Measure of elapsed time in either seconds, milliseconds, microseconds /// or nanoseconds. 
- DURATION + DURATION, + + /// Like STRING, but with 64-bit offsets + LARGE_STRING, + + /// Like BINARY, but with 64-bit offsets + LARGE_BINARY }; }; @@ -472,6 +478,7 @@ class ARROW_EXPORT DoubleType class ARROW_EXPORT ListType : public NestedType { public: static constexpr Type::type type_id = Type::LIST; + using offset_type = int32_t; // List can contain any other logical value type explicit ListType(const std::shared_ptr& value_type) @@ -486,7 +493,7 @@ class ARROW_EXPORT ListType : public NestedType { std::shared_ptr value_type() const { return children_[0]->type(); } DataTypeLayout layout() const override { - return {{1, CHAR_BIT * sizeof(int32_t)}, false}; + return {{1, CHAR_BIT * sizeof(offset_type)}, false}; } std::string ToString() const override; @@ -550,23 +557,78 @@ class ARROW_EXPORT FixedSizeListType : public NestedType { int32_t list_size_; }; +/// \brief Base class for all variable-size binary data types +class ARROW_EXPORT BaseBinaryType : public DataType, public NoExtraMeta { + public: + using DataType::DataType; +}; + /// \brief Concrete type class for variable-size binary data -class ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { +class ARROW_EXPORT BinaryType : public BaseBinaryType { public: static constexpr Type::type type_id = Type::BINARY; + static constexpr bool is_utf8 = false; + using offset_type = int32_t; BinaryType() : BinaryType(Type::BINARY) {} DataTypeLayout layout() const override { - return {{1, CHAR_BIT * sizeof(int32_t), DataTypeLayout::kVariableSizeBuffer}, false}; + return {{1, CHAR_BIT * sizeof(offset_type), DataTypeLayout::kVariableSizeBuffer}, + false}; } std::string ToString() const override; std::string name() const override { return "binary"; } protected: - // Allow subclasses to change the logical type. - explicit BinaryType(Type::type logical_type) : DataType(logical_type) {} + // Allow subclasses like StringType to change the logical type. 
+ explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} +}; + +/// \brief Concrete type class for large variable-size binary data +class ARROW_EXPORT LargeBinaryType : public BaseBinaryType { + public: + static constexpr Type::type type_id = Type::LARGE_BINARY; + static constexpr bool is_utf8 = false; + using offset_type = int64_t; + + LargeBinaryType() : LargeBinaryType(Type::LARGE_BINARY) {} + + DataTypeLayout layout() const override { + return {{1, CHAR_BIT * sizeof(offset_type), DataTypeLayout::kVariableSizeBuffer}, + false}; + } + + std::string ToString() const override; + std::string name() const override { return "large_binary"; } + + protected: + // Allow subclasses like LargeStringType to change the logical type. + explicit LargeBinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} +}; + +/// \brief Concrete type class for variable-size string data, utf8-encoded +class ARROW_EXPORT StringType : public BinaryType { + public: + static constexpr Type::type type_id = Type::STRING; + static constexpr bool is_utf8 = true; + + StringType() : BinaryType(Type::STRING) {} + + std::string ToString() const override; + std::string name() const override { return "utf8"; } +}; + +/// \brief Concrete type class for large variable-size string data, utf8-encoded +class ARROW_EXPORT LargeStringType : public LargeBinaryType { + public: + static constexpr Type::type type_id = Type::LARGE_STRING; + static constexpr bool is_utf8 = true; + + LargeStringType() : LargeBinaryType(Type::LARGE_STRING) {} + + std::string ToString() const override; + std::string name() const override { return "large_utf8"; } }; /// \brief Concrete type class for fixed-size binary data @@ -591,17 +653,6 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public Parametri int32_t byte_width_; }; -/// \brief Concrete type class for variable-size string data, utf8-encoded -class ARROW_EXPORT StringType : public BinaryType { - public: - static constexpr 
Type::type type_id = Type::STRING; - - StringType() : BinaryType(Type::STRING) {} - - std::string ToString() const override; - std::string name() const override { return "utf8"; } -}; - /// \brief Concrete type class for struct data class ARROW_EXPORT StructType : public NestedType { public: diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index c42d66152da..9935af5111f 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -65,6 +65,11 @@ class BinaryArray; class BinaryBuilder; struct BinaryScalar; +class LargeBinaryType; +class LargeBinaryArray; +class LargeBinaryBuilder; +struct LargeBinaryScalar; + class FixedSizeBinaryType; class FixedSizeBinaryArray; class FixedSizeBinaryBuilder; @@ -75,6 +80,11 @@ class StringArray; class StringBuilder; struct StringScalar; +class LargeStringType; +class LargeStringArray; +class LargeStringBuilder; +struct LargeStringScalar; + class ListType; class ListArray; class ListBuilder; @@ -218,8 +228,12 @@ std::shared_ptr ARROW_EXPORT float32(); std::shared_ptr ARROW_EXPORT float64(); /// \brief Return a StringType instance std::shared_ptr ARROW_EXPORT utf8(); +/// \brief Return a LargeStringType instance +std::shared_ptr ARROW_EXPORT large_utf8(); /// \brief Return a BinaryType instance std::shared_ptr ARROW_EXPORT binary(); +/// \brief Return a LargeBinaryType instance +std::shared_ptr ARROW_EXPORT large_binary(); /// \brief Return a Date32Type instance std::shared_ptr ARROW_EXPORT date32(); /// \brief Return a Date64Type instance diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index c4c549f371b..df3e280d3f2 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -243,6 +243,15 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return binary(); } }; +template <> +struct TypeTraits { + using ArrayType = LargeBinaryArray; + using BuilderType = LargeBinaryBuilder; + using ScalarType = LargeBinaryScalar; + constexpr static bool 
is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return large_binary(); } +}; + template <> struct TypeTraits { using ArrayType = FixedSizeBinaryArray; @@ -260,6 +269,15 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return utf8(); } }; +template <> +struct TypeTraits { + using ArrayType = LargeStringArray; + using BuilderType = LargeStringBuilder; + using ScalarType = LargeStringScalar; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return large_utf8(); } +}; + template <> struct CTypeTraits : public TypeTraits { using ArrowType = StringType; @@ -367,6 +385,12 @@ struct is_8bit_int { (std::is_same::value || std::is_same::value); }; +template +struct is_any_string_type { + static constexpr bool value = + std::is_same::value || std::is_same::value; +}; + template using enable_if_8bit_int = typename std::enable_if::value, R>::type; @@ -418,10 +442,18 @@ using enable_if_has_c_type = typename std::enable_if::value, R>::t template using enable_if_null = typename std::enable_if::value, R>::type; +template +using enable_if_base_binary = + typename std::enable_if::value, R>::type; + template using enable_if_binary = typename std::enable_if::value, R>::type; +template +using enable_if_large_binary = + typename std::enable_if::value, R>::type; + template using enable_if_boolean = typename std::enable_if::value, R>::type; @@ -580,6 +612,17 @@ static inline bool is_binary_like(Type::type type_id) { return false; } +static inline bool is_large_binary_like(Type::type type_id) { + switch (type_id) { + case Type::LARGE_BINARY: + case Type::LARGE_STRING: + return true; + default: + break; + } + return false; +} + static inline bool is_dictionary(Type::type type_id) { return type_id == Type::DICTIONARY; } diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index 53b341b53d7..2ec6c6421d0 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -47,6 
+47,8 @@ ARRAY_VISITOR_DEFAULT(FloatArray) ARRAY_VISITOR_DEFAULT(DoubleArray) ARRAY_VISITOR_DEFAULT(BinaryArray) ARRAY_VISITOR_DEFAULT(StringArray) +ARRAY_VISITOR_DEFAULT(LargeBinaryArray) +ARRAY_VISITOR_DEFAULT(LargeStringArray) ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray) ARRAY_VISITOR_DEFAULT(Date32Array) ARRAY_VISITOR_DEFAULT(Date64Array) @@ -90,6 +92,8 @@ TYPE_VISITOR_DEFAULT(FloatType) TYPE_VISITOR_DEFAULT(DoubleType) TYPE_VISITOR_DEFAULT(StringType) TYPE_VISITOR_DEFAULT(BinaryType) +TYPE_VISITOR_DEFAULT(LargeStringType) +TYPE_VISITOR_DEFAULT(LargeBinaryType) TYPE_VISITOR_DEFAULT(FixedSizeBinaryType) TYPE_VISITOR_DEFAULT(Date64Type) TYPE_VISITOR_DEFAULT(Date32Type) @@ -134,6 +138,8 @@ SCALAR_VISITOR_DEFAULT(FloatScalar) SCALAR_VISITOR_DEFAULT(DoubleScalar) SCALAR_VISITOR_DEFAULT(StringScalar) SCALAR_VISITOR_DEFAULT(BinaryScalar) +SCALAR_VISITOR_DEFAULT(LargeStringScalar) +SCALAR_VISITOR_DEFAULT(LargeBinaryScalar) SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar) SCALAR_VISITOR_DEFAULT(Date64Scalar) SCALAR_VISITOR_DEFAULT(Date32Scalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index a4979e9cef8..1c854c47804 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -43,6 +43,8 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const DoubleArray& array); virtual Status Visit(const StringArray& array); virtual Status Visit(const BinaryArray& array); + virtual Status Visit(const LargeStringArray& array); + virtual Status Visit(const LargeBinaryArray& array); virtual Status Visit(const FixedSizeBinaryArray& array); virtual Status Visit(const Date32Array& array); virtual Status Visit(const Date64Array& array); @@ -81,6 +83,8 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const DoubleType& type); virtual Status Visit(const StringType& type); virtual Status Visit(const BinaryType& type); + virtual Status Visit(const LargeStringType& type); + virtual Status Visit(const LargeBinaryType& type); virtual Status Visit(const 
FixedSizeBinaryType& type); virtual Status Visit(const Date64Type& type); virtual Status Visit(const Date32Type& type); @@ -119,6 +123,8 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const DoubleScalar& scalar); virtual Status Visit(const StringScalar& scalar); virtual Status Visit(const BinaryScalar& scalar); + virtual Status Visit(const LargeStringScalar& scalar); + virtual Status Visit(const LargeBinaryScalar& scalar); virtual Status Visit(const FixedSizeBinaryScalar& scalar); virtual Status Visit(const Date64Scalar& scalar); virtual Status Visit(const Date32Scalar& scalar); diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 544763a2f74..3ed058e6492 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -47,6 +47,8 @@ namespace arrow { ACTION(Double); \ ACTION(String); \ ACTION(Binary); \ + ACTION(LargeString); \ + ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ ACTION(Duration); \ ACTION(Date32); \ @@ -186,12 +188,13 @@ struct ArrayDataVisitor> { }; template -struct ArrayDataVisitor> { +struct ArrayDataVisitor> { template static Status Visit(const ArrayData& arr, Visitor* visitor) { + using offset_type = typename T::offset_type; constexpr uint8_t empty_value = 0; - const int32_t* offsets = arr.GetValues(1); + const offset_type* offsets = arr.GetValues(1); const uint8_t* data; if (!arr.buffers[2]) { data = &empty_value; diff --git a/format/Schema.fbs b/format/Schema.fbs index 36127925eff..06bcf6ee670 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -107,9 +107,20 @@ table FloatingPoint { table Utf8 { } +/// Opaque binary data table Binary { } +/// Same as Utf8, but with 64-bit offsets, allowing to represent +/// extremely large data values. +table LargeUtf8 { +} + +/// Same as Binary, but with 64-bit offsets, allowing to represent +/// extremely large data values. 
+table LargeBinary { +} + table FixedSizeBinary { /// Number of bytes per value byteWidth: int; @@ -235,6 +246,8 @@ union Type { FixedSizeList, Map, Duration, + LargeBinary, + LargeUtf8, } /// ----------------------------------------------------------------------