diff --git a/LICENSE.txt b/LICENSE.txt index c2b0a996fed..06347f5445d 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1894,7 +1894,7 @@ This project includes code from the autobrew project. The following files are based on code from the autobrew project: * r/tools/autobrew * dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb -* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb +* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb Copyright (c) 2019, Jeroen Ooms License: MIT @@ -1976,6 +1976,20 @@ License: http://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- +This project includes code from Velox. + + * cpp/src/arrow/util/string_header.h + +is based on Velox's + + * velox/type/StringView.h + +Copyright: Copyright (c) Facebook, Inc. and its affiliates. +Home page: https://github.com/facebookincubator/velox +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + The file cpp/src/arrow/vendored/musl/strptime.c has the following license Copyright © 2005-2020 Rich Felker, et al. 
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 9e8ecb5ceb3..6b657738536 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4593,6 +4593,7 @@ macro(build_opentelemetry) -DWITH_OTLP=ON -DWITH_OTLP_HTTP=ON -DWITH_OTLP_GRPC=OFF + -DWITH_STL=ON "-DProtobuf_INCLUDE_DIR=${OPENTELEMETRY_PROTOBUF_INCLUDE_DIR}" "-DProtobuf_LIBRARY=${OPENTELEMETRY_PROTOBUF_INCLUDE_DIR}" "-DProtobuf_PROTOC_EXECUTABLE=${OPENTELEMETRY_PROTOC_EXECUTABLE}") diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 00cf899349a..bd093edaad8 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -226,6 +226,7 @@ set(ARROW_SRCS util/ree_util.cc util/string.cc util/string_builder.cc + util/string_header.cc util/task_group.cc util/tdigest.cc util/thread_pool.cc diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index f7b8d7954e1..07c18998253 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -87,6 +87,10 @@ struct ScalarFromArraySlotImpl { return Finish(a.GetString(index_)); } + Status Visit(const BinaryViewArray& a) { + return Finish(std::string{a.GetView(index_)}); + } + Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); } Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); } diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index 9466b5a48f9..7790c32ae3f 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ b/cpp/src/arrow/array/array_binary.cc @@ -89,6 +89,28 @@ LargeStringArray::LargeStringArray(int64_t length, Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } +BinaryViewArray::BinaryViewArray(const std::shared_ptr& data) { + ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW); + SetData(data); +} + 
+BinaryViewArray::BinaryViewArray(int64_t length, std::shared_ptr headers, + BufferVector char_buffers, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) + : PrimitiveArray(binary_view(), length, std::move(headers), std::move(null_bitmap), + null_count, offset) { + data_->buffers.resize(char_buffers.size() + 2); + std::move(char_buffers.begin(), char_buffers.end(), data_->buffers.begin() + 2); +} + +StringViewArray::StringViewArray(const std::shared_ptr& data) { + ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW); + SetData(data); +} + +Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } + FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& data) { SetData(data); } diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 7e58a96ff84..70e477b2741 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -217,6 +218,84 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { Status ValidateUTF8() const; }; +// ---------------------------------------------------------------------- +// BinaryView and StringView + +/// Concrete Array class for variable-size binary view data using the +/// StringHeader struct to reference in-line or out-of-line string values +class ARROW_EXPORT BinaryViewArray : public PrimitiveArray { + public: + using TypeClass = BinaryViewType; + using IteratorType = stl::ArrayIterator; + + explicit BinaryViewArray(const std::shared_ptr& data); + + BinaryViewArray(int64_t length, std::shared_ptr headers, + BufferVector char_buffers, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const StringHeader* raw_values() const { + return reinterpret_cast(raw_values_) + data_->offset; + } + + // For API compatibility with BinaryArray etc. 
+ std::string_view GetView(int64_t i) const { + const auto& s = raw_values()[i]; + if (raw_pointers_) { + return std::string_view{s}; + } + if (s.IsInline()) { + return {s.GetInlineData(), s.size()}; + } + auto* char_buffers = data_->buffers.data() + 2; + return {char_buffers[s.GetBufferIndex()]->data_as() + s.GetBufferOffset(), + s.size()}; + } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + IteratorType end() const { return IteratorType(*this, length()); } + + bool has_raw_pointers() const { return raw_pointers_; } + + protected: + using PrimitiveArray::PrimitiveArray; + + void SetData(const std::shared_ptr& data) { + PrimitiveArray::SetData(data); + raw_pointers_ = + internal::checked_cast(*type()).has_raw_pointers(); + } + + bool raw_pointers_ = false; +}; + +/// Concrete Array class for variable-size string view (utf-8) data using +/// StringHeader to reference in-line or out-of-line string values +class ARROW_EXPORT StringViewArray : public BinaryViewArray { + public: + using TypeClass = StringViewType; + + explicit StringViewArray(const std::shared_ptr& data); + + StringViewArray(int64_t length, std::shared_ptr data, BufferVector char_buffers, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : BinaryViewArray(length, std::move(data), std::move(char_buffers), + std::move(null_bitmap), null_count, offset) { + data_->type = utf8_view(); + } + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 3bc9bb91a02..0453c9f9070 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ 
b/cpp/src/arrow/array/array_binary_test.cc @@ -27,17 +27,21 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" +#include "arrow/array/validate.h" #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" #include "arrow/visit_data_inline.h" namespace arrow { @@ -365,38 +369,211 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); } +namespace string_header_helpers { + +StringHeader Inline(std::string_view chars) { + DCHECK(StringHeader::IsInline(chars.size())); + return StringHeader{chars}; +} + +StringHeader NotInline(std::string_view prefix, size_t length, size_t buffer_index, + size_t offset) { + DCHECK_EQ(prefix.size(), 4); + DCHECK(!StringHeader::IsInline(length)); + StringHeader s{prefix.data(), length}; + s.SetIndexOffset(static_cast(buffer_index), static_cast(offset)); + return s; +} + +// Produce an Array of index/offset views from a std::vector of index/offset StringHeader +Result> MakeViewArray( + BufferVector char_buffers, const std::vector& headers, + bool validate = true) { + auto length = static_cast(headers.size()); + ARROW_ASSIGN_OR_RAISE(auto headers_buf, CopyBufferFromVector(headers)); + auto arr = std::make_shared(length, std::move(headers_buf), + std::move(char_buffers)); + if (validate) { + RETURN_NOT_OK(arr->ValidateFull()); + } + return arr; +} + +// Produce an Array of index/offset views from a std::vector of raw pointer StringHeader +Result> MakeViewArrayFromRaw( + BufferVector char_buffers, const std::vector& raw, + 
bool validate = true) { + ARROW_ASSIGN_OR_RAISE(auto raw_buf, CopyBufferFromVector(raw)); + StringViewArray raw_arr{static_cast(raw.size()), std::move(raw_buf), + char_buffers}; + raw_arr.data()->type = utf8_view(/*has_raw_pointers=*/true); + + ARROW_ASSIGN_OR_RAISE(auto io_buf, AllocateBuffer(raw.size() * sizeof(StringHeader))); + RETURN_NOT_OK(internal::SwapStringHeaderPointers( + *raw_arr.data(), io_buf->mutable_data_as())); + + auto arr = std::make_shared(raw.size(), std::move(io_buf), + std::move(char_buffers)); + if (validate) { + RETURN_NOT_OK(arr->ValidateFull()); + } + return arr; +} + +} // namespace string_header_helpers + +TEST(StringViewArray, Validate) { + using string_header_helpers::Inline; + using string_header_helpers::MakeViewArray; + using string_header_helpers::NotInline; + + // Since this is a test of validation, we need to be able to construct invalid arrays. + auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)"); + auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy"); + + // empty array is valid + EXPECT_THAT(MakeViewArray({}, {}), Ok()); + + // empty array with some character buffers is valid + EXPECT_THAT(MakeViewArray({buffer_s, buffer_y}, {}), Ok()); + + // inline views need not have a corresponding buffer + EXPECT_THAT(MakeViewArray({}, {Inline("hello"), Inline("world"), Inline("inline me")}), + Ok()); + + // non-inline views are expected to reference only buffers managed by the array + EXPECT_THAT( + MakeViewArray({buffer_s, buffer_y}, {NotInline("supe", buffer_s->size(), 0, 0), + NotInline("yyyy", buffer_y->size(), 1, 0)}), + Ok()); + + // views may not reference char buffers not present in the array + EXPECT_THAT(MakeViewArray({}, {NotInline("supe", buffer_s->size(), 0, 0)}), + Raises(StatusCode::IndexError)); + // ... 
or ranges which overflow the referenced char buffer + EXPECT_THAT(MakeViewArray({buffer_s}, {NotInline("supe", buffer_s->size() + 50, 0, 0)}), + Raises(StatusCode::IndexError)); + + // Additionally, the prefixes of non-inline views must match the character buffer + EXPECT_THAT( + MakeViewArray({buffer_s, buffer_y}, {NotInline("SUPE", buffer_s->size(), 0, 0), + NotInline("yyyy", buffer_y->size(), 1, 0)}), + Raises(StatusCode::Invalid)); + + // Invalid string views which are masked by a null bit do not cause validation to fail + auto invalid_but_masked = MakeViewArray({buffer_s}, + {NotInline("SUPE", buffer_s->size(), 0, 0), + NotInline("yyyy", 50, 40, 30)}, + /*validate=*/false) + .ValueOrDie() + ->data(); + invalid_but_masked->null_count = 2; + invalid_but_masked->buffers[0] = *AllocateEmptyBitmap(2); + EXPECT_THAT(internal::ValidateArrayFull(*invalid_but_masked), Ok()); + + // overlapping views are allowed + EXPECT_THAT(MakeViewArray({buffer_s}, + { + NotInline("supe", buffer_s->size(), 0, 0), + NotInline("uper", buffer_s->size() - 1, 0, 1), + NotInline("perc", buffer_s->size() - 2, 0, 2), + NotInline("erca", buffer_s->size() - 3, 0, 3), + }), + Ok()); +} + +TEST(StringViewArray, BinaryViewArrayFromRawPointerViews) { + using string_header_helpers::Inline; + using string_header_helpers::MakeViewArray; + using string_header_helpers::MakeViewArrayFromRaw; + using string_header_helpers::NotInline; + + auto Roundtrip = [&](Result> maybe_arr) { + ARROW_ASSIGN_OR_RAISE(auto arr, maybe_arr); + + std::vector raw(arr->length()); + RETURN_NOT_OK(internal::SwapStringHeaderPointers(*arr->data(), raw.data())); + for (size_t i = 0; i < raw.size(); ++i) { + if (static_cast(raw[i]) != arr->GetView(i)) { + return Status::Invalid("Produced incorrect raw pointer headers"); + } + } + + BufferVector char_buffers{arr->data()->buffers.begin() + 2, + arr->data()->buffers.end()}; + ARROW_ASSIGN_OR_RAISE(auto round_tripped, + MakeViewArrayFromRaw(std::move(char_buffers), raw)); + + if 
(round_tripped->Equals(arr)) { + return Status::OK(); + } + return Status::Invalid("not equal"); + }; + + EXPECT_THAT(Roundtrip(MakeViewArray( + {}, {Inline("hello"), Inline("world"), Inline("inline me")})), + Ok()); + + auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)"); + auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy"); + + EXPECT_THAT(Roundtrip(MakeViewArray({buffer_s, buffer_y}, + { + NotInline("supe", buffer_s->size(), 0, 0), + Inline("hello"), + NotInline("yyyy", buffer_y->size(), 1, 0), + Inline("world"), + NotInline("uper", buffer_s->size() - 1, 0, 1), + })), + Ok()); + + // use a larger number of buffers to test the binary search case + BufferVector buffers; + std::vector headers; + for (size_t i = 0; i < 40; ++i) { + buffers.push_back(Buffer::FromString(std::string(13, 'c'))); + headers.push_back(NotInline("cccc", 13, i, 0)); + } + EXPECT_THAT(Roundtrip(MakeViewArray(buffers, headers)), Ok()); + + EXPECT_THAT( + MakeViewArrayFromRaw({buffer_s, buffer_y}, + { + "not inlined, outside buffers", + }), + Raises(StatusCode::IndexError, + testing::HasSubstr("pointed outside the provided character buffers"))); +} + template class TestUTF8Array : public ::testing::Test { public: using TypeClass = T; - using offset_type = typename TypeClass::offset_type; using ArrayType = typename TypeTraits::ArrayType; - Status ValidateUTF8(int64_t length, std::vector offsets, - std::string_view data, int64_t offset = 0) { - ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared(data), - /*null_bitmap=*/nullptr, /*null_count=*/0, offset); - return arr.ValidateUTF8(); + Status ValidateUTF8(const Array& arr) { + return checked_cast(arr).ValidateUTF8(); } - Status ValidateUTF8(const std::string& json) { - auto ty = TypeTraits::type_singleton(); - auto arr = ArrayFromJSON(ty, json); - return checked_cast(*arr).ValidateUTF8(); + Status ValidateUTF8(std::vector values) { + std::shared_ptr arr; + ArrayFromVector(values, &arr); + return 
ValidateUTF8(*arr); } void TestValidateUTF8() { - ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])")); - ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff + ASSERT_OK(ValidateUTF8(*ArrayFromJSON(TypeTraits::type_singleton(), + R"(["Voix", "ambiguë", "d’un", "cœur"])"))); + ASSERT_OK(ValidateUTF8({"\xf4\x8f\xbf\xbf"})); // \U0010ffff - ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4")); + ASSERT_RAISES(Invalid, ValidateUTF8({"\xf4"})); // More tests in TestValidateData() above // (ValidateFull() calls ValidateUTF8() internally) } }; -TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes); +TYPED_TEST_SUITE(TestUTF8Array, StringOrStringViewArrowTypes); TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); } @@ -883,11 +1060,15 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { void SetUp() override { type_ = TypeTraits::type_singleton(); } void TestBasics() { - auto array = ArrayFromJSON(type_, R"(["foo", null, "bar"])"); + auto array = ArrayFromJSON( + type_, + R"(["foo", null, "bar", "inline_me", "allocate_me_aaaaa", "allocate_me_bbbb"])"); BinaryAppender appender; ArraySpanVisitor visitor; ASSERT_OK(visitor.Visit(*array->data(), &appender)); - ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"foo", "(null)", "bar"})); + ASSERT_THAT(appender.data, + ::testing::ElementsAreArray({"foo", "(null)", "bar", "inline_me", + "allocate_me_aaaaa", "allocate_me_bbbb"})); ARROW_UNUSED(visitor); // Workaround weird MSVC warning } @@ -904,7 +1085,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { std::shared_ptr type_; }; -TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes); +TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes); TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 602a468fafb..ff87e0b8ddb 100644 --- a/cpp/src/arrow/array/array_test.cc 
+++ b/cpp/src/arrow/array/array_test.cc @@ -594,12 +594,14 @@ static ScalarVector GetScalars() { std::make_shared(60, duration(TimeUnit::SECOND)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), std::make_shared( hello, fixed_size_binary(static_cast(hello->size()))), std::make_shared(Decimal128(10), decimal(16, 4)), std::make_shared(Decimal256(10), decimal(76, 38)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), @@ -646,7 +648,7 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { ASSERT_EQ(array->null_count(), 0); // test case for ARROW-13321 - for (int64_t i : std::vector{0, length / 2, length - 1}) { + for (int64_t i : {int64_t{0}, length / 2, length - 1}) { ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i)); AssertScalarsEqual(*s, *scalar, /*verbose=*/true); } @@ -720,32 +722,32 @@ TEST_F(TestArray, TestMakeEmptyArray) { FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))}); std::vector union_type_codes{7, 42}; - std::shared_ptr types[] = {null(), - boolean(), - int8(), - uint16(), - int32(), - uint64(), - float64(), - binary(), - large_binary(), - fixed_size_binary(3), - decimal(16, 4), - utf8(), - large_utf8(), - list(utf8()), - list(int64()), - large_list(large_utf8()), - fixed_size_list(utf8(), 3), - fixed_size_list(int64(), 4), - dictionary(int32(), utf8()), - struct_({field("a", utf8()), field("b", int32())}), - sparse_union(union_fields1, union_type_codes), - sparse_union(union_fields2, union_type_codes), - dense_union(union_fields1, union_type_codes), - dense_union(union_fields2, union_type_codes)}; - - for (auto type : types) { + for (auto type : {null(), + boolean(), + int8(), + uint16(), + int32(), + uint64(), + float64(), + binary(), + binary_view(), + large_binary(), + 
fixed_size_binary(3), + decimal(16, 4), + utf8(), + utf8_view(), + large_utf8(), + list(utf8()), + list(int64()), + large_list(large_utf8()), + fixed_size_list(utf8(), 3), + fixed_size_list(int64(), 4), + dictionary(int32(), utf8()), + struct_({field("a", utf8()), field("b", int32())}), + sparse_union(union_fields1, union_type_codes), + sparse_union(union_fields2, union_type_codes), + dense_union(union_fields1, union_type_codes), + dense_union(union_fields2, union_type_codes)}) { ARROW_SCOPED_TRACE("type = ", type->ToString()); ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type)); ASSERT_OK(array->ValidateFull()); diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc index 07dc3014e40..97110ea97f3 100644 --- a/cpp/src/arrow/array/array_view_test.cc +++ b/cpp/src/arrow/array/array_view_test.cc @@ -126,6 +126,38 @@ TEST(TestArrayView, StringAsBinary) { CheckView(expected, arr); } +TEST(TestArrayView, StringViewAsBinaryView) { + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = ArrayFromJSON(utf8_view(), json); + auto expected = ArrayFromJSON(binary_view(), json); + CheckView(arr, expected); + CheckView(expected, arr); + } +} + +TEST(TestArrayView, StringViewAsBinaryViewInStruct) { + auto padl = ArrayFromJSON(list(int16()), "[[0, -1], [], [42]]"); + auto padr = ArrayFromJSON(utf8(), R"(["foox", "barz", null])"); + + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = + StructArray::Make({padl, ArrayFromJSON(utf8_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + auto expected = + StructArray::Make({padl, ArrayFromJSON(binary_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + + CheckView(arr, expected); + CheckView(expected, arr); + } +} + TEST(TestArrayView, PrimitiveWrongSize) { auto arr = ArrayFromJSON(int16(), "[0, -1, 42]"); CheckViewFails(arr, int8()); diff --git 
a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 70da1fbb296..9cdfb0b4681 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -104,10 +104,7 @@ namespace { template struct AppendScalarImpl { template - enable_if_t::value || is_decimal_type::value || - is_fixed_size_binary_type::value, - Status> - Visit(const T&) { + Status HandleFixedWidth(const T&) { auto builder = checked_cast::BuilderType*>(builder_); RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); @@ -125,7 +122,17 @@ struct AppendScalarImpl { } template - enable_if_base_binary Visit(const T&) { + enable_if_t::value, Status> Visit(const T& t) { + return HandleFixedWidth(t); + } + + Status Visit(const FixedSizeBinaryType& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal128Type& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); } + + template + enable_if_t::value || is_string_like_type::value, Status> + Visit(const T&) { int64_t data_size = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { const auto& scalar = checked_cast::ScalarType&>(*it); diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 571f450aab9..b5ed7187edb 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -40,6 +40,97 @@ namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// Binary/StringView +BinaryViewBuilder::BinaryViewBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : BinaryViewBuilder(pool) { + ARROW_CHECK(!checked_cast(*type).has_raw_pointers()); +} + +Status BinaryViewBuilder::AppendValues(const std::vector& values, + const uint8_t* valid_bytes) { + // We only need to allocate memory for the out-of-line strings + std::size_t out_of_line_total = std::accumulate( + values.begin(), 
values.end(), 0ULL, [](uint64_t sum, const std::string& str) { + size_t length = str.size(); + return sum + (length > StringHeader::kInlineSize ? length : 0); + }); + RETURN_NOT_OK(Reserve(values.size())); + RETURN_NOT_OK(ReserveData(out_of_line_total)); + + if (valid_bytes != nullptr) { + for (std::size_t i = 0; i < values.size(); ++i) { + if (valid_bytes[i]) { + UnsafeAppend(values[i]); + } else { + UnsafeAppendNull(); + } + } + } else { + for (const auto& value : values) { + UnsafeAppend(value); + } + } + UnsafeAppendToBitmap(valid_bytes, values.size()); + return Status::OK(); +} + +Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) { + auto bitmap = array.GetValues(0, 0); + auto values = array.GetValues(1) + offset; + + int64_t out_of_line_total = 0; + for (int64_t i = 0; i < length; i++) { + if (!values[i].IsInline()) { + out_of_line_total += static_cast(values[i].size()); + } + } + RETURN_NOT_OK(Reserve(length)); + RETURN_NOT_OK(ReserveData(out_of_line_total)); + for (int64_t i = 0; i < length; i++) { + if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) { + if (values[i].IsInline()) { + UnsafeAppend(values[i]); + } else { + UnsafeAppend(values[i].data(), values[i].size()); + } + } else { + UnsafeAppendNull(); + } + } + return Status::OK(); +} + +Status BinaryViewBuilder::FinishInternal(std::shared_ptr* out) { + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); + BufferVector buffers = {null_bitmap, data}; + for (auto&& buffer : data_heap_builder_.Finish()) { + buffers.push_back(std::move(buffer)); + } + *out = ArrayData::Make(type(), length_, std::move(buffers), null_count_); + capacity_ = length_ = null_count_ = 0; + Reset(); + return Status::OK(); +} + +Status BinaryViewBuilder::ReserveData(int64_t length) { + if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) { + return 
Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 4GB"); + } + return data_heap_builder_.Reserve(length); +} + +void BinaryViewBuilder::Reset() { + ArrayBuilder::Reset(); + data_builder_.Reset(); + data_heap_builder_.Reset(); +} + // ---------------------------------------------------------------------- // Fixed width binary @@ -125,8 +216,8 @@ const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { std::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { const uint8_t* data_ptr = byte_builder_.data(); - return std::string_view(reinterpret_cast(data_ptr + i * byte_width_), - byte_width_); + return {reinterpret_cast(data_ptr + i * byte_width_), + static_cast(byte_width_)}; } // ---------------------------------------------------------------------- @@ -173,10 +264,10 @@ Status ChunkedStringBuilder::Finish(ArrayVector* out) { RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); // Change data type to string/utf8 - for (size_t i = 0; i < out->size(); ++i) { - std::shared_ptr data = (*out)[i]->data(); + for (auto& chunk : *out) { + std::shared_ptr data = chunk->data()->Copy(); data->type = ::arrow::utf8(); - (*out)[i] = std::make_shared(data); + chunk = std::make_shared(std::move(data)); } return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index b0c4fe2fc81..1eb906f91ed 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -204,10 +204,10 @@ class BaseBinaryBuilder } } } else { - for (std::size_t i = 0; i < values.size(); ++i) { + for (const auto& value : values) { UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend( - reinterpret_cast(values[i].data()), values[i].size()); + value_data_builder_.UnsafeAppend(reinterpret_cast(value.data()), + value.size()); } } @@ -463,6 +463,249 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { std::shared_ptr type() const override { 
return large_utf8(); } }; +// ---------------------------------------------------------------------- +// BinaryViewBuilder, StringViewBuilder +// +// These builders do not support building raw pointer string view arrays. + +namespace internal { + +// Because we construct StringHeader objects incrementally, resizing buffers is +// not an option as memory addresses for out-of-line strings will change. Thus, +// we allocate medium-sized memory chunks and accumulate data in those, which +// may result in some waste if there are many large-ish strings. If a string +// comes along that does not fit into a block, we allocate a new block and +// write into that. +// +// Later we can implement optimizations to continuing filling underfull blocks +// after encountering a large string that required allocating a new block. +class ARROW_EXPORT StringHeapBuilder { + public: + static constexpr int64_t kDefaultBlocksize = 1 << 20; // 1MB + + StringHeapBuilder(MemoryPool* pool, int64_t alignment, + int64_t blocksize = kDefaultBlocksize) + : pool_(pool), blocksize_(blocksize), alignment_(alignment) {} + + void UnsafeAppend(StringHeader* raw_not_inlined) { + memcpy(current_out_buffer_, raw_not_inlined->GetRawPointer(), + raw_not_inlined->size()); + raw_not_inlined->SetIndexOffset(static_cast(blocks_.size() - 1), + current_offset_); + current_out_buffer_ += raw_not_inlined->size(); + current_remaining_bytes_ -= raw_not_inlined->size(); + current_offset_ += static_cast(raw_not_inlined->size()); + } + + /// \brief Ensure that the indicated number of bytes can be appended via + /// UnsafeAppend operations without the need to allocate more memory + Status Reserve(int64_t num_bytes) { + if (num_bytes > current_remaining_bytes_) { + // Ensure the buffer is fully overwritten to avoid leaking uninitialized + // bytes from the allocator + if (current_remaining_bytes_ > 0) { + std::memset(current_out_buffer_, 0, current_remaining_bytes_); + blocks_.back() = SliceBuffer(blocks_.back(), 0, + 
blocks_.back()->size() - current_remaining_bytes_); + } + current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_block, + AllocateBuffer(current_remaining_bytes_, alignment_, pool_)); + current_offset_ = 0; + current_out_buffer_ = new_block->mutable_data(); + blocks_.emplace_back(std::move(new_block)); + } + return Status::OK(); + } + + void Reset() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + blocks_.clear(); + } + + int64_t current_remaining_bytes() const { return current_remaining_bytes_; } + + std::vector> Finish() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + return std::move(blocks_); + } + + private: + MemoryPool* pool_; + const int64_t blocksize_; + int64_t alignment_; + std::vector> blocks_; + + uint32_t current_offset_ = 0; + uint8_t* current_out_buffer_ = NULLPTR; + int64_t current_remaining_bytes_ = 0; +}; + +} // namespace internal + +class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { + public: + using TypeClass = BinaryViewType; + + // this constructor provided for MakeBuilder compatibility + BinaryViewBuilder(const std::shared_ptr&, MemoryPool* pool); + + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + data_builder_(pool, alignment), + data_heap_builder_(pool, alignment) {} + + int64_t current_block_bytes_remaining() const { + return data_heap_builder_.current_remaining_bytes(); + } + + Status Append(const uint8_t* value, int64_t length) { + if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 4GB"); + } + if (!StringHeader::IsInline(length)) { + ARROW_RETURN_NOT_OK(ReserveData(length)); + } + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(StringHeader(value, length)); + 
return Status::OK(); + } + + Status Append(const char* value, int64_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Builder should have been presized using Reserve() and ReserveData(), + /// respectively, and the value must not be larger than 4GB + void UnsafeAppend(const uint8_t* value, int64_t length) { + UnsafeAppend(StringHeader(value, length)); + } + + void UnsafeAppend(const char* value, int64_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + void UnsafeAppend(StringHeader value) { + UnsafeAppendToBitmap(true); + if (!value.IsInline()) { + // String is stored out-of-line + data_heap_builder_.UnsafeAppend(&value); + } + data_builder_.UnsafeAppend(value); + } + + /// \brief Ensures there is enough allocated available capacity in the + /// out-of-line data heap to append the indicated number of bytes without + /// additional allocations + Status ReserveData(int64_t length); + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, StringHeader{}); // zero + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(StringHeader{}); // zero + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append a empty element (length-0 inline string) + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(StringHeader{}); // zero + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief 
Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, StringHeader{}); // zero + UnsafeSetNotNull(length); + return Status::OK(); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(StringHeader{}); // zero + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + data_builder_.UnsafeAppend(StringHeader{}); // zero + UnsafeAppendToBitmap(true); + } + + /// \brief Append a sequence of strings in one shot. + /// + /// \param[in] values a vector of strings + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const std::vector& values, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies + /// the underlying out-of-line string memory to avoid memory lifetime issues + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + void Reset() override; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override { return binary_view(); } + + protected: + static constexpr int64_t ValueSizeLimit() { + return std::numeric_limits::max(); + } + + TypedBufferBuilder data_builder_; + + // Accumulates out-of-line data in fixed-size chunks which are then attached + // to the resulting ArrayData + internal::StringHeapBuilder data_heap_builder_; +}; + +class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder { + public: + using BinaryViewBuilder::BinaryViewBuilder; + std::shared_ptr type() const override { return utf8_view(); } +}; + // 
---------------------------------------------------------------------- // FixedSizeBinaryBuilder @@ -498,7 +741,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status Append(const Buffer& s) { ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(std::string_view(s)); + UnsafeAppend(s); return Status::OK(); } @@ -549,7 +792,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppend(reinterpret_cast(value.data())); } - void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view(s)); } + void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); } void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 061fb600412..c99a6faceeb 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -193,6 +193,12 @@ Status DictionaryMemoTable::GetOrInsert(const BinaryType*, std::string_view valu return impl_->GetOrInsert(value, out); } +Status DictionaryMemoTable::GetOrInsert(const BinaryViewType*, std::string_view value, + int32_t* out) { + // Create BinaryArray dictionary for now + return impl_->GetOrInsert(value, out); +} + Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out) { return impl_->GetOrInsert(value, out); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index cb0aaf30991..0cc82930a14 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -60,6 +60,12 @@ struct DictionaryValue> { BinaryType, LargeBinaryType>::type; }; +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = BinaryViewType; +}; + template struct DictionaryValue> { using type = std::string_view; @@ -115,6 +121,10 @@ class ARROW_EXPORT DictionaryMemoTable { Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out); Status 
GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out); + // TODO: Consider working StringHeader throughout the hashing machinery to + // benefit from faster comparisons, reduced need to allocate memory + Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out); + class DictionaryMemoTableImpl; std::unique_ptr impl_; }; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 65c82384369..6e863490480 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -229,6 +229,43 @@ class ConcatenateImpl { return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } + Status Visit(const BinaryViewType& type) { + out_->buffers.resize(2); + + for (const auto& in_data : in_) { + auto begin = in_data->buffers.begin() + 2; + auto end = in_data->buffers.end(); + + for (auto it = begin; it != end; ++it) { + out_->buffers.push_back(*it); + } + } + + ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, sizeof(StringHeader))); + ARROW_ASSIGN_OR_RAISE(auto header_buffer, ConcatenateBuffers(header_buffers, pool_)); + + if (!type.has_raw_pointers()) { + auto* s = header_buffer->mutable_data_as(); + + size_t preceding_buffer_count = 0; + + int64_t i = in_[0]->length; + for (size_t in_index = 1; in_index < in_.size(); ++in_index) { + preceding_buffer_count += in_[in_index - 1]->buffers.size() - 2; + + for (int64_t end_i = i + in_[in_index]->length; i < end_i; ++i) { + if (s[i].IsInline()) continue; + auto buffer_index = + static_cast(s[i].GetBufferIndex() + preceding_buffer_count); + s[i].SetIndexOffset(buffer_index, s[i].GetBufferOffset()); + } + } + } + + out_->buffers[1] = std::move(header_buffer); + return Status::OK(); + } + Status Visit(const ListType&) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); @@ -605,6 +642,10 @@ Result> Concatenate(const ArrayVector& arrays, MemoryPool return Status::Invalid("Must 
pass at least one array"); } + if (arrays.size() == 1) { + return arrays[0]; + } + // gather ArrayData of input arrays ArrayDataVector data(arrays.size()); for (size_t i = 0; i < arrays.size(); ++i) { diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 4c03fab731f..d9b0b7d235b 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -92,8 +92,14 @@ class ConcatenateTest : public ::testing::Test { for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); + ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); + ASSERT_OK(expected->ValidateFull()); auto slices = this->Slices(array, offsets); + for (auto slice : slices) { + ASSERT_OK(slice->ValidateFull()); + } + ASSERT_OK(expected->ValidateFull()); ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices)); AssertArraysEqual(*expected, *actual); if (actual->data()->buffers[0]) { @@ -155,6 +161,23 @@ TEST_F(ConcatenateTest, StringType) { }); } +TEST_F(ConcatenateTest, StringViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability); + ASSERT_OK((**out).ValidateFull()); + }); + + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability); + const ArrayData& io = *(*out)->data(); + auto raw_buf = AllocateBuffer(io.buffers[1]->size()).ValueOrDie(); + ABORT_NOT_OK( + internal::SwapStringHeaderPointers(io, raw_buf->mutable_data_as())); + (*out)->data()->buffers[1] = std::move(raw_buf); + (*out)->data()->type = utf8_view(/*has_raw_pointers=*/true); + }); +} + TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { *out = diff --git 
a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 8764e9c354c..1118b5d2220 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -186,7 +186,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { } this->offset = data.offset; - for (int i = 0; i < static_cast(data.buffers.size()); ++i) { + for (int i = 0; i < std::min(static_cast(data.buffers.size()), 3); ++i) { const std::shared_ptr& buffer = data.buffers[i]; // It is the invoker-of-kernels's responsibility to ensure that // const buffers are not written to accidentally. @@ -199,7 +199,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { Type::type type_id = this->type->id(); if (type_id == Type::EXTENSION) { - const ExtensionType* ext_type = checked_cast(this->type); + auto* ext_type = checked_cast(this->type); type_id = ext_type->storage_type()->id(); } @@ -214,6 +214,14 @@ void ArraySpan::SetMembers(const ArrayData& data) { this->buffers[i] = {}; } + if (type_id == Type::STRING_VIEW || type_id == Type::BINARY_VIEW) { + // store the span of character buffers in the third buffer + this->buffers[2].data = + const_cast(reinterpret_cast(data.buffers.data() + 2)); + this->buffers[2].size = + static_cast(data.buffers.size() - 2) * sizeof(std::shared_ptr); + } + if (type_id == Type::DICTIONARY) { this->child_data.resize(1); this->child_data[0].SetMembers(*data.dictionary); @@ -248,6 +256,8 @@ int GetNumBuffers(const DataType& type) { case Type::LARGE_BINARY: case Type::STRING: case Type::LARGE_STRING: + case Type::STRING_VIEW: + case Type::BINARY_VIEW: case Type::DENSE_UNION: return 3; case Type::EXTENSION: @@ -330,12 +340,12 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } else if (is_base_binary_like(type_id)) { const auto& scalar = checked_cast(value); this->buffers[1].data = reinterpret_cast(this->scratch_space); - const uint8_t* data_buffer = nullptr; - int64_t data_size = 0; - if (scalar.is_valid) { - data_buffer = scalar.value->data(); - data_size = 
scalar.value->size(); - } + static auto kEmptyBuffer = Buffer::FromString(""); + const auto& value = scalar.is_valid ? scalar.value : kEmptyBuffer; + this->buffers[2].data = const_cast(value->data()); + this->buffers[2].size = value->size(); + this->buffers[2].owner = &value; + int64_t data_size = this->buffers[2].size; if (is_binary_like(type_id)) { SetOffsetsForScalar(this, reinterpret_cast(this->scratch_space), data_size); @@ -344,8 +354,25 @@ void ArraySpan::FillFromScalar(const Scalar& value) { SetOffsetsForScalar(this, reinterpret_cast(this->scratch_space), data_size); } - this->buffers[2].data = const_cast(data_buffer); - this->buffers[2].size = data_size; + } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) { + const auto& scalar = checked_cast(value); + + this->buffers[1].data = reinterpret_cast(this->scratch_space); + if (scalar.is_valid) { + if (checked_cast(type)->has_raw_pointers()) { + new (this->scratch_space) StringHeader{ + scalar.value->data_as(), static_cast(scalar.value->size())}; + } else { + new (this->scratch_space) StringHeader{ + scalar.value->data_as(), static_cast(scalar.value->size()), 0, + scalar.value->data_as()}; + } + this->buffers[2].data = + const_cast(reinterpret_cast(&scalar.value)); + this->buffers[2].size = sizeof(std::shared_ptr); + } else { + new (this->scratch_space) StringHeader{}; + } } else if (type_id == Type::FIXED_SIZE_BINARY) { const auto& scalar = checked_cast(value); this->buffers[1].data = const_cast(scalar.value->data()); @@ -687,7 +714,8 @@ struct ViewDataImpl { } RETURN_NOT_OK(CheckInputAvailable()); - const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx]; + const auto& in_layout = in_layouts[in_layout_idx]; + const auto& in_spec = in_layout.buffers[in_buffer_idx]; if (out_spec != in_spec) { return InvalidView("incompatible layouts"); } @@ -698,6 +726,18 @@ struct ViewDataImpl { DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx); 
out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); ++in_buffer_idx; + + if (in_buffer_idx == in_layout.buffers.size()) { + if (out_layout.variadic_spec != in_layout.variadic_spec) { + return InvalidView("incompatible layouts"); + } + + if (in_layout.variadic_spec) { + for (; in_buffer_idx < in_data_item->buffers.size(); ++in_buffer_idx) { + out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); + } + } + } AdjustInputPointer(); } diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 82a6e733727..f7d1adc3e4b 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -384,6 +384,7 @@ struct ARROW_EXPORT ArraySpan { ArraySpan(const ArrayData& data) { // NOLINT implicit conversion SetMembers(data); } + /// Warning: this produces an ArraySpan which cannot be safely moved/copied! explicit ArraySpan(const Scalar& data) { FillFromScalar(data); } /// If dictionary-encoded, put dictionary in the first entry @@ -391,6 +392,7 @@ struct ARROW_EXPORT ArraySpan { /// \brief Populate ArraySpan to look like an array of length 1 pointing at /// the data members of a Scalar value + /// Warning: this produces an ArraySpan which cannot be safely moved/copied! 
void FillFromScalar(const Scalar& value); void SetMembers(const ArrayData& data); @@ -468,10 +470,12 @@ struct ARROW_EXPORT ArraySpan { void SetSlice(int64_t offset, int64_t length) { this->offset = offset; this->length = length; - if (this->type->id() != Type::NA) { + if (this->type->id() == Type::NA) { + this->null_count = this->length; + } else if (this->MayHaveNulls()) { this->null_count = kUnknownNullCount; } else { - this->null_count = this->length; + this->null_count = 0; } } diff --git a/cpp/src/arrow/array/dict_internal.h b/cpp/src/arrow/array/dict_internal.h index 5245c8d0ff3..acc0534a2ae 100644 --- a/cpp/src/arrow/array/dict_internal.h +++ b/cpp/src/arrow/array/dict_internal.h @@ -156,6 +156,37 @@ struct DictionaryTraits> { } }; +template +struct DictionaryTraits> { + using MemoTableType = typename HashTraits::MemoTableType; + + static_assert(std::is_same_v>); + + // Instead of defining a custom memo table for StringView we reuse BinaryType's, + // then convert to string view when we copy data out of the memo table. 
+ static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + DCHECK(type->id() == Type::STRING_VIEW || type->id() == Type::BINARY_VIEW); + + BinaryViewBuilder builder(pool); + RETURN_NOT_OK(builder.Resize(memo_table.size() - start_offset)); + RETURN_NOT_OK(builder.ReserveData(memo_table.values_size())); + memo_table.VisitValues(static_cast(start_offset), + [&](std::string_view s) { builder.UnsafeAppend(s); }); + RETURN_NOT_OK(builder.FinishInternal(out)); + if (checked_cast(*type).has_raw_pointers()) { + // the builder produces index/offset string views, so swap to raw pointers + RETURN_NOT_OK( + SwapStringHeaderPointers(**out, (*out)->GetMutableValues(1))); + } + (*out)->type = type; + return Status::OK(); + } +}; + template struct DictionaryTraits> { using MemoTableType = typename HashTraits::MemoTableType; diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 07be8176fc0..09a9c4afa31 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -43,6 +43,9 @@ #include "arrow/util/decimal.h" #include "arrow/util/endian.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" +#include "arrow/util/span.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -267,6 +270,47 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const BinaryViewType& type) { + if (type.has_raw_pointers()) { + return Status::Invalid( + "Swapping endianness of binary / string view with raw pointers"); + } + + auto* s = data_->buffers[1]->data_as(); + ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); + auto* new_s = new_buffer->mutable_data_as(); + + // NOTE: data_->length not trusted (see warning above) + const int64_t length = data_->buffers[1]->size() / sizeof(StringHeader); + + for (int64_t i = 0; i < length; i++) { + auto size = 
static_cast(s[i].size()); +#if ARROW_LITTLE_ENDIAN + size = bit_util::FromBigEndian(size); +#else + size = bit_util::FromLittleEndian(size); +#endif + if (StringHeader::IsInline(size)) { + new_s[i] = s[i]; + std::memcpy(static_cast(&new_s[i]), &size, sizeof(uint32_t)); + continue; + } + + uint32_t buffer_index = s[i].GetBufferIndex(); + uint32_t offset = s[i].GetBufferOffset(); +#if ARROW_LITTLE_ENDIAN + buffer_index = bit_util::FromBigEndian(buffer_index); + offset = bit_util::FromBigEndian(offset); +#else + buffer_index = bit_util::FromLittleEndian(buffer_index); + offset = bit_util::FromLittleEndian(offset); +#endif + new_s[i] = StringHeader{size, s[i].GetPrefix(), buffer_index, offset}; + } + out_->buffers[1] = std::move(new_buffer); + return Status::OK(); + } + Status Visit(const ListType& type) { RETURN_NOT_OK(SwapOffsets(1)); return Status::OK(); @@ -372,6 +416,10 @@ class NullArrayFactory { return MaxOf(sizeof(typename T::offset_type) * (length_ + 1)); } + Status Visit(const BinaryViewType& type) { + return MaxOf(sizeof(StringHeader) * length_); + } + Status Visit(const FixedSizeListType& type) { return MaxOf(GetBufferLength(type.value_type(), type.list_size() * length_)); } @@ -491,6 +539,11 @@ class NullArrayFactory { return Status::OK(); } + Status Visit(const BinaryViewType&) { + out_->buffers.resize(2, buffer_); + return Status::OK(); + } + template enable_if_var_size_list Visit(const T& type) { out_->buffers.resize(2, buffer_); @@ -577,7 +630,7 @@ class NullArrayFactory { } MemoryPool* pool_; - std::shared_ptr type_; + const std::shared_ptr& type_; int64_t length_; std::shared_ptr out_; std::shared_ptr buffer_; @@ -638,8 +691,26 @@ class RepeatedArrayFactory { RETURN_NOT_OK(CreateBufferOf(value->data(), value->size(), &values_buffer)); auto size = static_cast(value->size()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); - out_ = std::make_shared::ArrayType>(length_, offsets_buffer, - values_buffer); + out_ = std::make_shared::ArrayType>( 
+ length_, std::move(offsets_buffer), std::move(values_buffer)); + return Status::OK(); + } + + template + enable_if_binary_view_like Visit(const T&) { + const std::shared_ptr& value = + checked_cast::ScalarType&>(scalar_).value; + + StringHeader header{std::string_view{*value}}; + std::shared_ptr header_buffer; + RETURN_NOT_OK(CreateBufferOf(&header, sizeof(header), &header_buffer)); + + BufferVector char_buffers; + if (!header.IsInline()) { + char_buffers.push_back(value); + } + out_ = std::make_shared::ArrayType>( + length_, std::move(header_buffer), std::move(char_buffers)); return Status::OK(); } @@ -934,5 +1005,222 @@ std::vector RechunkArraysConsistently( return rechunked_groups; } +namespace {
+// Rewrite the raw-pointer StringHeaders of `raw` as index/offset StringHeaders.
+// On success one header has been written to `io` per slot of `raw` (null slots
+// become empty headers). Every out-of-line view is matched to the entry of
+// `char_buffers` whose memory range contains it; IndexError is returned if a view
+// lies within none of them.
+Status FromRawPointerStringHeaders(const ArraySpan& raw,
+                                   util::span> char_buffers,
+                                   StringHeader* io) {
+  DCHECK_NE(char_buffers.size(), 0);
+
+  // True if the bytes viewed by `s` lie entirely within `buffer`.
+  auto IsInBuffer = [](const Buffer& buffer, StringHeader s) {
+    return buffer.data_as() <= s.data() &&
+           buffer.data_as() + buffer.size() >= s.data() + s.size();
+  };
+
+  auto Write = [&](auto find_containing_buffer) {
+    // Given `find_containing_buffer` which looks up the index of a buffer containing
+    // a StringHeader, write an equivalent buffer of index/offset string views.
+    static const Buffer kEmptyBuffer{""};
+    const Buffer* buffer_containing_previous_view = &kEmptyBuffer;
+    // Only read after the first lookup has assigned it (no out-of-line view fits in
+    // kEmptyBuffer); initialized anyway so no path can observe an indeterminate value.
+    uint32_t buffer_index = 0;
+
+    auto* raw_ptr = raw.GetValues(1);
+
+    bool all_valid = true;
+    VisitNullBitmapInline(
+        raw.buffers[0].data, raw.offset, raw.length, raw.null_count,
+        [&] {
+          // Copied to a local variable, so even if io == raw_ptr
+          // we can modify safely.
+          auto s = *raw_ptr++;
+
+          if (!s.IsInline()) {
+            // Fast path: for most string view arrays, we'll have runs
+            // of views into the same buffer.
+            if (ARROW_PREDICT_FALSE(!IsInBuffer(*buffer_containing_previous_view, s))) {
+              auto found = find_containing_buffer(s);
+              if (ARROW_PREDICT_FALSE(!found)) {
+                all_valid = false;
+                return;
+              }
+              // Assume that we're at the start of a run of views into
+              // char_buffers[buffer_index]; adjust the fast path's pointer accordingly
+              buffer_index = *found;
+              buffer_containing_previous_view = char_buffers[buffer_index].get();
+            }
+
+            s.SetIndexOffset(
+                buffer_index,
+                static_cast(
+                    s.data() - char_buffers[buffer_index]->template data_as()));
+          }
+          *io++ = s;
+        },
+        [&] {
+          ++raw_ptr;
+          *io++ = {};
+        });
+
+    if (!all_valid) {
+      return Status::IndexError(
+          "A header pointed outside the provided character buffers");
+    }
+    return Status::OK();
+  };
+
+  // Scan the character buffers in order and return the index of the first one
+  // containing `s`, if any.
+  auto LinearSearch = [&](StringHeader s) -> std::optional {
+    uint32_t buffer_index = 0;
+    for (const auto& char_buffer : char_buffers) {
+      if (IsInBuffer(*char_buffer, s)) return buffer_index;
+      ++buffer_index;
+    }
+    return {};
+  };
+
+  if (char_buffers.size() <= 32) {
+    // If there are few buffers to search through, sorting/binary search is not
+    // worthwhile. TODO(bkietz) benchmark this and get a less magic number here.
+    return Write(LinearSearch);
+  }
+
+  // Indices into char_buffers, ordered by the address at which each buffer starts.
+  auto sort_indices = ArgSort(
+      char_buffers, [](const auto& l, const auto& r) { return l->data() < r->data(); });
+
+  auto first_overlapping = std::adjacent_find(
+      sort_indices.begin(), sort_indices.end(), [&](uint32_t before, uint32_t after) {
+        // Sorted neighbours overlap when the earlier buffer ends past the start of
+        // the later one (merely touching ranges are still disjoint).
+        return char_buffers[before]->data() + char_buffers[before]->size() >
+               char_buffers[after]->data();
+      });
+  if (ARROW_PREDICT_FALSE(first_overlapping != sort_indices.end())) {
+    // Using a binary search with overlapping buffers would not *uniquely* identify
+    // a potentially-containing buffer. Moreover this should be a fairly rare case
+    // so optimizing for it seems premature.
+    return Write(LinearSearch);
+  }
+
+  auto BinarySearch = [&](StringHeader s) -> std::optional {
+    // Find the first buffer whose data starts after the start of the view's data:
+    // only buffers *before* it could contain the view. Since we've additionally
+    // checked that the buffers do not overlap, only the buffer *immediately before*
+    // it could contain the view.
+    auto one_past_potential_super =
+        std::upper_bound(sort_indices.begin(), sort_indices.end(), s,
+                         [&](const StringHeader& s, int64_t i) {
+                           // upper_bound needs "value sorts before element": order
+                           // by start address, matching the sort above.
+                           return s.data() < char_buffers[i]->data_as<char>();
+                         });
+
+    if (ARROW_PREDICT_FALSE(one_past_potential_super == sort_indices.begin())) {
+      // Every buffer starts after the view does.
+      return {};
+    }
+
+    uint32_t buffer_index = *(one_past_potential_super - 1);
+    const Buffer& char_buffer = *char_buffers[buffer_index];
+    if (ARROW_PREDICT_TRUE(IsInBuffer(char_buffer, s))) return buffer_index;
+
+    return {};
+  };
+
+  return Write(BinarySearch);
+}
+
+Status ToRawPointerStringHeaders(const ArraySpan& io, + util::span> char_buffers, + StringHeader* raw) { + DCHECK_NE(char_buffers.size(), 0); + + uint32_t buffer_index = 0; + const char* buffer_data = char_buffers[0]->data_as(); + auto* io_ptr = io.GetValues(1); + + bool all_valid = true; + VisitNullBitmapInline( + io.buffers[0].data, io.offset, io.length, io.null_count, + [&] { + // Copied to a local variable, so even if raw == io_ptr + // we can modify safely. + auto s = *io_ptr++; + + if (!s.IsInline()) { + // Fast path: for most string view arrays, we'll have runs + // of views into the same buffer.
+ if (ARROW_PREDICT_FALSE(s.GetBufferIndex() != buffer_index)) { + if (ARROW_PREDICT_FALSE(s.GetBufferIndex() >= char_buffers.size())) { + all_valid = false; + return; + } + // Assume that we're at the start of a run of views into + // char_buffers[buffer_index]; adjust the fast path's pointer accordingly + buffer_index = s.GetBufferIndex(); + buffer_data = char_buffers[buffer_index]->data_as(); + } + s.SetRawPointer(buffer_data + s.GetBufferOffset()); + } + *raw++ = s; + }, + [&] { + ++io_ptr; + *raw++ = {}; + }); + + if (!all_valid) { + return Status::IndexError("A header pointed outside the provided character buffers"); + } + return Status::OK(); +} +} // namespace + +Status SwapStringHeaderPointers(const ArraySpan& in, StringHeader* out) { + util::span char_buffers{ + reinterpret_cast*>(in.buffers[2].data), + static_cast(in.buffers[2].size / sizeof(std::shared_ptr))}; + + if (char_buffers.size() == 0) { + // If there are no character buffers, then all string views must be inline. + // In this case the buffer does not require swizzling between pointers and + // index/offsets. + auto* in_ptr = in.GetValues(1); + + bool all_inline = true; + VisitNullBitmapInline( + in.buffers[0].data, in.offset, in.length, in.null_count, + [&] { + all_inline = all_inline && in_ptr->IsInline(); + auto s = *in_ptr++; + *out++ = s; + }, + [&] { + ++in_ptr; + *out++ = {}; + }); + if (ARROW_PREDICT_FALSE(!all_inline)) { + return Status::IndexError( + "A header was not inline when no character buffers were provided"); + } + return Status::OK(); + } + + return checked_cast(in.type)->has_raw_pointers() + ? 
FromRawPointerStringHeaders(in, char_buffers, out) + : ToRawPointerStringHeaders(in, char_buffers, out); +} + +void StringHeadersFromStrings(const ArraySpan& strings, StringHeader* s) { + auto* buffer_data = reinterpret_cast(strings.buffers[2].data); + VisitArraySpanInline( + strings, + [&](std::string_view v) { + *s++ = StringHeader{v.data(), static_cast(v.size()), 0, buffer_data}; + }, + [&] { *s++ = StringHeader{}; }); +} + +void RawPointerStringHeadersFromStrings(const ArraySpan& strings, StringHeader* s) { + VisitArraySpanInline( + strings, + [&](std::string_view v) { + *s++ = StringHeader{v.data(), static_cast(v.size())}; + }, + [&] { *s++ = StringHeader{}; }); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index 6e6c61bd03d..adae067db0c 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -85,5 +85,26 @@ Result> SwapEndianArrayData( ARROW_EXPORT std::vector RechunkArraysConsistently(const std::vector&); +/// Convert between index/offset and raw pointer StringHeaders. +/// +/// This function can be used to overwrite a buffer of StringHeader if desired, +/// IE it is supported for `in.buffers[1].data == out`. +/// +/// Note that calling this function is not necessary if all StringHeaders happen to be +/// Inline; this is usually efficiently detectable by checking for an absence of any +/// character buffers. +/// +/// Will raise IndexError if a header views memory outside the provided character buffers. 
+ARROW_EXPORT +Status SwapStringHeaderPointers(const ArraySpan& in, StringHeader* out); + +/// Fill a buffer of index/offset StringHeader from a dense string array +ARROW_EXPORT +void StringHeadersFromStrings(const ArraySpan& strings, StringHeader* io); + +/// Fill a buffer of raw pointer StringHeader from a dense string array +ARROW_EXPORT +void RawPointerStringHeadersFromStrings(const ArraySpan& strings, StringHeader* raw); + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 0f2bd458357..63f97bf38f9 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -31,41 +31,44 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/util/sort.h" +#include "arrow/util/string.h" +#include "arrow/util/string_header.h" +#include "arrow/util/unreachable.h" #include "arrow/util/utf8.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { struct UTF8DataValidator { const ArrayData& data; - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } - - template - enable_if_string Visit(const StringType&) { - util::InitializeUTF8(); + template + Status Visit(const T&) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + util::InitializeUTF8(); - int64_t i = 0; - return VisitArraySpanInline( - data, - [&](std::string_view v) { - if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { - return Status::Invalid("Invalid UTF8 sequence at string index ", i); - } - ++i; - return Status::OK(); - }, - [&]() { - ++i; - return Status::OK(); - }); + int64_t i = 0; + return VisitArraySpanInline( + data, + [&](std::string_view v) { + if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { + return Status::Invalid("Invalid UTF8 sequence at string index ", 
i); + } + ++i; + return Status::OK(); + }, + [&]() { + ++i; + return Status::OK(); + }); + } else { + Unreachable("utf-8 validation of non string type"); + } } }; @@ -74,10 +77,7 @@ struct BoundsChecker { int64_t min_value; int64_t max_value; - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } + Status Visit(const DataType&) { Unreachable("bounds checking of non integer type"); } template enable_if_integer Visit(const IntegerType&) { @@ -169,6 +169,14 @@ struct ValidateArrayImpl { return Status::OK(); } + Status Visit(const StringViewType& type) { + RETURN_NOT_OK(ValidateBinaryView(type)); + if (full_validation) { + RETURN_NOT_OK(ValidateUTF8(data)); + } + return Status::OK(); + } + Status Visit(const Date64Type& type) { RETURN_NOT_OK(ValidateFixedWidthBuffers()); @@ -248,6 +256,8 @@ struct ValidateArrayImpl { Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); } + Status Visit(const BinaryViewType& type) { return ValidateBinaryView(type); } + Status Visit(const ListType& type) { return ValidateListLike(type); } Status Visit(const LargeListType& type) { return ValidateListLike(type); } @@ -453,7 +463,14 @@ struct ValidateArrayImpl { return Status::Invalid("Array length is negative"); } - if (data.buffers.size() != layout.buffers.size()) { + if (layout.variadic_spec) { + if (data.buffers.size() < layout.buffers.size()) { + return Status::Invalid("Expected at least ", layout.buffers.size(), + " buffers in array " + "of type ", + type.ToString(), ", got ", data.buffers.size()); + } + } else if (data.buffers.size() != layout.buffers.size()) { return Status::Invalid("Expected ", layout.buffers.size(), " buffers in array " "of type ", @@ -469,7 +486,9 @@ struct ValidateArrayImpl { for (int i = 0; i < static_cast(data.buffers.size()); ++i) { const auto& buffer = data.buffers[i]; - const auto& spec = layout.buffers[i]; + const auto& spec = i < static_cast(layout.buffers.size()) + ? 
layout.buffers[i] + : *layout.variadic_spec; if (buffer == nullptr) { continue; @@ -595,6 +614,63 @@ struct ValidateArrayImpl { return Status::OK(); }
+  // Validate a binary/string view array. The header buffer must hold at least
+  // `length + offset` StringHeaders. Under full validation of an index/offset
+  // array, every non-null, out-of-line header must additionally reference an
+  // existing character buffer, stay within that buffer's bounds, and carry a prefix
+  // equal to the first bytes of the data it references. Raw pointer arrays receive
+  // no checks beyond the header buffer size.
+  Status ValidateBinaryView(const BinaryViewType& type) {
+    // Layout validation skips null buffers, so a missing header buffer can reach
+    // this point: treat it as empty instead of dereferencing it.
+    int64_t headers_byte_size =
+        data.buffers[1] != nullptr ? data.buffers[1]->size() : 0;
+    int64_t required_headers = data.length + data.offset;
+    if (static_cast(headers_byte_size / sizeof(StringHeader)) <
+        required_headers) {
+      return Status::Invalid("Header buffer size (bytes): ", headers_byte_size,
+                             " isn't large enough for length: ", data.length,
+                             " and offset: ", data.offset);
+    }
+
+    if (!full_validation) {
+      return Status::OK();
+    }
+
+    if (type.has_raw_pointers()) {
+      // TODO(bkietz) It may be preferable to extract the validation logic which is in
+      // the raw pointer - index/offset view array conversion functions and actually
+      // validate raw pointer view arrays, rather than giving up here
+      return Status::OK();
+    }
+
+    auto* s = data.GetValues(1);
+    for (int64_t i = 0; i < data.length; ++i, ++s) {
+      if (data.IsNull(i)) continue;
+
+      // Inline views carry their bytes in the header; nothing else to check.
+      if (s->IsInline()) continue;
+
+      // Character buffers follow the validity and header buffers, hence the + 2.
+      size_t buffer_index = s->GetBufferIndex();
+      if (ARROW_PREDICT_FALSE(buffer_index + 2 >= data.buffers.size())) {
+        return Status::IndexError("String view at slot ", i, " references buffer ",
+                                  buffer_index, " but there are only ",
+                                  data.buffers.size() - 2, " character buffers");
+      }
+
+      size_t begin = s->GetBufferOffset();
+      size_t end = begin + s->size();
+      const auto& buffer = data.buffers[buffer_index + 2];
+      if (ARROW_PREDICT_FALSE(buffer == nullptr)) {
+        // Null character buffers also pass layout validation; report, don't crash.
+        return Status::Invalid("String view at slot ", i, " references buffer ",
+                               buffer_index, " which is null");
+      }
+      auto size = static_cast(buffer->size());
+      if (ARROW_PREDICT_FALSE(end > size)) {
+        return Status::IndexError("String view at slot ", i, " references range ", begin,
+                                  "-", end, " of buffer ", buffer_index,
+                                  " but that buffer is only ", size, " bytes long");
+      }
+
+      // Named so as not to shadow the `data` member used by the loop above.
+      const char* view_data = buffer->data_as() + begin;
+      if (ARROW_PREDICT_FALSE(
+              std::memcmp(view_data, s->GetInlineData(), StringHeader::kPrefixSize) != 0)) {
+        return Status::Invalid("String view at slot ", i, " has inlined prefix 0x",
+                               HexEncode(s->GetInlineData(), StringHeader::kPrefixSize),
+                               " but the out-of-line character data begins with 0x",
+                               HexEncode(view_data, StringHeader::kPrefixSize));
+      }
+    }
+
+    return Status::OK();
+  }
+
template Status ValidateListLike(const ListType& type) { const ArrayData& values = *data.child_data[0]; @@ -796,7 +872,8 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d ARROW_EXPORT Status ValidateUTF8(const ArrayData& data) { - DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING); + DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW || + data.type->id() == Type::LARGE_STRING); UTF8DataValidator validator{data}; return VisitTypeInline(*data.type, &validator); } @@ -804,5 +881,4 @@ Status ValidateUTF8(const ArrayData& data) { ARROW_EXPORT Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 65f1abda161..ac231e72342 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -186,6 +186,10 @@ class ARROW_EXPORT Buffer { #endif return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR; } + template + const T* data_as() const { + return reinterpret_cast(data()); + } /// \brief Return a writable pointer to the buffer's data /// @@ -203,6 +207,10 @@ class ARROW_EXPORT Buffer { return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ?
const_cast(data_) : NULLPTR; } + template + T* mutable_data_as() { + return reinterpret_cast(mutable_data()); + } /// \brief Return the device address of the buffer's data uintptr_t address() const { return reinterpret_cast(data_); } diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index 5f37e552004..ab397485d60 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -117,6 +117,9 @@ class ARROW_EXPORT BufferBuilder { UnsafeAppend(data, length); return Status::OK(); } + Status Append(std::string_view v) { + return Append(v.data(), static_cast(v.size())); + } /// \brief Append copies of a value to the buffer /// @@ -138,6 +141,9 @@ class ARROW_EXPORT BufferBuilder { memcpy(data_ + size_, data, static_cast(length)); size_ += length; } + void UnsafeAppend(std::string_view v) { + UnsafeAppend(v.data(), static_cast(v.size())); + } void UnsafeAppend(const int64_t num_copies, uint8_t value) { memset(data_ + size_, value, static_cast(num_copies)); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index caddbf9db55..a1bc398cca4 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -148,6 +148,8 @@ struct DictionaryBuilderCase { Status Visit(const StringType&) { return CreateFor(); } Status Visit(const LargeBinaryType&) { return CreateFor(); } Status Visit(const LargeStringType&) { return CreateFor(); } + Status Visit(const BinaryViewType&) { return CreateFor(); } + Status Visit(const StringViewType&) { return CreateFor(); } Status Visit(const FixedSizeBinaryType&) { return CreateFor(); } Status Visit(const Decimal128Type&) { return CreateFor(); } Status Visit(const Decimal256Type&) { return CreateFor(); } @@ -162,6 +164,11 @@ struct DictionaryBuilderCase { template Status CreateFor() { + if constexpr (is_binary_view_like_type::value) { + if (checked_cast(*value_type).has_raw_pointers()) { + return NotImplemented(*value_type); + } + } using AdaptiveBuilderType = DictionaryBuilder; if 
(dictionary != nullptr) { out->reset(new AdaptiveBuilderType(dictionary, pool)); @@ -190,7 +197,12 @@ struct DictionaryBuilderCase { struct MakeBuilderImpl { template - enable_if_not_nested Visit(const T&) { + enable_if_not_nested Visit(const T& t) { + if constexpr (is_binary_view_like_type::value) { + if (t.has_raw_pointers()) { + return NotImplemented(); + } + } out.reset(new typename TypeTraits::BuilderType(type, pool)); return Status::OK(); } diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index df41cd22c9e..c318a54b130 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -261,6 +261,36 @@ class RangeDataEqualsImpl { // Also matches StringType Status Visit(const BinaryType& type) { return CompareBinary(type); } + // Also matches StringViewType + Status Visit(const BinaryViewType& type) { + auto* left_values = left_.GetValues(1) + left_start_idx_; + auto* right_values = right_.GetValues(1) + right_start_idx_; + if (type.has_raw_pointers()) { + VisitValidRuns([&](int64_t i, int64_t length) { + for (auto end_i = i + length; i < end_i; ++i) { + if (left_values[i] != right_values[i]) { + return false; + } + } + return true; + }); + return Status::OK(); + } + + auto* left_buffers = left_.buffers.data() + 2; + auto* right_buffers = right_.buffers.data() + 2; + VisitValidRuns([&](int64_t i, int64_t length) { + for (auto end_i = i + length; i < end_i; ++i) { + if (!left_values[i].EqualsIndexOffset(left_buffers, right_values[i], + right_buffers)) { + return false; + } + } + return true; + }); + return Status::OK(); + } + // Also matches LargeStringType Status Visit(const LargeBinaryType& type) { return CompareBinary(type); } @@ -632,6 +662,12 @@ class TypeEqualsVisitor { return Status::OK(); } + Status Visit(const BinaryViewType& left) { + const auto& right = checked_cast(right_); + result_ = left.has_raw_pointers() == right.has_raw_pointers(); + return Status::OK(); + } + template enable_if_interval Visit(const T& left) { const auto& 
right = checked_cast(right_); @@ -802,8 +838,7 @@ class ScalarEqualsVisitor { Status Visit(const DoubleScalar& left) { return CompareFloating(left); } template - typename std::enable_if::value, Status>::type - Visit(const T& left) { + enable_if_t::value, Status> Visit(const T& left) { const auto& right = checked_cast(right_); result_ = internal::SharedPtrEquals(left.value, right.value); return Status::OK(); diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index c18dfa09522..231ce0e4bf4 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -302,6 +302,9 @@ void ComputeDataPreallocate(const DataType& type, case Type::LARGE_LIST: widths->emplace_back(64, /*added_length=*/1); return; + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + widths->emplace_back(static_cast(sizeof(StringHeader) * 8)); default: break; } diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc index fd554ba3d83..e05989ad7c2 100644 --- a/cpp/src/arrow/compute/kernel.cc +++ b/cpp/src/arrow/compute/kernel.cc @@ -175,7 +175,7 @@ std::shared_ptr DurationTypeUnit(TimeUnit::type unit) { class IntegerMatcher : public TypeMatcher { public: - IntegerMatcher() {} + IntegerMatcher() = default; bool Matches(const DataType& type) const override { return is_integer(type.id()); } @@ -194,7 +194,7 @@ std::shared_ptr Integer() { return std::make_shared class PrimitiveMatcher : public TypeMatcher { public: - PrimitiveMatcher() {} + PrimitiveMatcher() = default; bool Matches(const DataType& type) const override { return is_primitive(type.id()); } @@ -213,7 +213,7 @@ std::shared_ptr Primitive() { return std::make_shared BinaryLike() { class LargeBinaryLikeMatcher : public TypeMatcher { public: - LargeBinaryLikeMatcher() {} + LargeBinaryLikeMatcher() = default; bool Matches(const DataType& type) const override { return is_large_binary_like(type.id()); @@ -249,9 +249,35 @@ class LargeBinaryLikeMatcher : public TypeMatcher { std::string 
ToString() const override { return "large-binary-like"; } }; +std::shared_ptr LargeBinaryLike() { + return std::make_shared(); +} + +class BinaryViewLikeMatcher : public TypeMatcher { + public: + BinaryViewLikeMatcher() = default; + + bool Matches(const DataType& type) const override { + return type.id() == Type::BINARY_VIEW || type.id() == Type::STRING_VIEW; + } + + bool Equals(const TypeMatcher& other) const override { + if (this == &other) { + return true; + } + auto casted = dynamic_cast(&other); + return casted != nullptr; + } + std::string ToString() const override { return "binary-view-like"; } +}; + +std::shared_ptr BinaryViewLike() { + return std::make_shared(); +} + class FixedSizeBinaryLikeMatcher : public TypeMatcher { public: - FixedSizeBinaryLikeMatcher() {} + FixedSizeBinaryLikeMatcher() = default; bool Matches(const DataType& type) const override { return is_fixed_size_binary(type.id()); @@ -267,10 +293,6 @@ class FixedSizeBinaryLikeMatcher : public TypeMatcher { std::string ToString() const override { return "fixed-size-binary-like"; } }; -std::shared_ptr LargeBinaryLike() { - return std::make_shared(); -} - std::shared_ptr FixedSizeBinaryLike() { return std::make_shared(); } diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index a52636aeb6b..7ca2a661839 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -134,6 +134,9 @@ ARROW_EXPORT std::shared_ptr BinaryLike(); // Match types using 64-bit varbinary representation ARROW_EXPORT std::shared_ptr LargeBinaryLike(); +// Match types using the view varbinary representation +ARROW_EXPORT std::shared_ptr BinaryViewLike(); + // Match any fixed binary type ARROW_EXPORT std::shared_ptr FixedSizeBinaryLike(); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc index 8cf5a04addb..5a671cd05c8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc +++ 
b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc @@ -170,12 +170,6 @@ Status CastFromNull(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) return Status::OK(); } -Result ResolveOutputFromOptions(KernelContext* ctx, - const std::vector&) { - const CastOptions& options = checked_cast(*ctx->state()).options; - return options.to_type; -} - /// You will see some of kernels with /// /// kOutputTargetType @@ -184,8 +178,10 @@ Result ResolveOutputFromOptions(KernelContext* ctx, /// easiest initial way to get the requested cast type including the TimeUnit /// to the kernel (which is needed to compute the output) was through /// CastOptions - -OutputType kOutputTargetType(ResolveOutputFromOptions); +OutputType kOutputTargetType([](KernelContext* ctx, + const std::vector&) -> Result { + return CastState::Get(ctx).to_type; +}); Status ZeroCopyCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // TODO(wesm): alternative strategy for zero copy casts after ARROW-16576 diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h index 0a57e3381d3..c32a26cc948 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h @@ -71,9 +71,6 @@ void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_ty CastFunction* func); // OutputType::Resolver that returns a type the type from CastOptions -Result ResolveOutputFromOptions(KernelContext* ctx, - const std::vector& args); - ARROW_EXPORT extern OutputType kOutputTargetType; // Add generic casts to out_ty from: diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index a02f83351b3..c9bbcc94a2e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -726,7 +726,7 @@ std::shared_ptr GetCastToFloating(std::string name) { } 
std::shared_ptr GetCastToDecimal128() { - OutputType sig_out_ty(ResolveOutputFromOptions); + OutputType sig_out_ty = kOutputTargetType; auto func = std::make_shared("cast_decimal", Type::DECIMAL128); AddCommonCasts(Type::DECIMAL128, sig_out_ty, func.get()); @@ -761,7 +761,7 @@ std::shared_ptr GetCastToDecimal128() { } std::shared_ptr GetCastToDecimal256() { - OutputType sig_out_ty(ResolveOutputFromOptions); + OutputType sig_out_ty = kOutputTargetType; auto func = std::make_shared("cast_decimal256", Type::DECIMAL256); AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index ebeb597207a..25e79baab35 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -25,8 +25,11 @@ #include "arrow/compute/kernels/scalar_cast_internal.h" #include "arrow/compute/kernels/temporal_internal.h" #include "arrow/result.h" +#include "arrow/util/cpu_info.h" #include "arrow/util/formatting.h" #include "arrow/util/int_util.h" +#include "arrow/util/span.h" +#include "arrow/util/unreachable.h" #include "arrow/util/utf8_internal.h" #include "arrow/visit_data_inline.h" @@ -36,8 +39,7 @@ using internal::StringFormatter; using util::InitializeUTF8; using util::ValidateUTF8Inline; -namespace compute { -namespace internal { +namespace compute::internal { namespace { @@ -53,7 +55,7 @@ struct NumericToStringCastFunctor { static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& input = batch[0].array; FormatterType formatter(input.type); - BuilderType builder(input.type->GetSharedPtr(), ctx->memory_pool()); + BuilderType builder(TypeTraits::type_singleton(), ctx->memory_pool()); RETURN_NOT_OK(VisitArraySpanInline( input, [&](value_type v) { @@ -77,7 +79,7 @@ struct DecimalToStringCastFunctor { static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) { const ArraySpan& input = batch[0].array; FormatterType formatter(input.type); - BuilderType builder(input.type->GetSharedPtr(), ctx->memory_pool()); + BuilderType builder(TypeTraits::type_singleton(), ctx->memory_pool()); RETURN_NOT_OK(VisitArraySpanInline( input, [&](std::string_view bytes) { @@ -105,7 +107,7 @@ struct TemporalToStringCastFunctor { static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& input = batch[0].array; FormatterType formatter(input.type); - BuilderType builder(input.type->GetSharedPtr(), ctx->memory_pool()); + BuilderType builder(TypeTraits::type_singleton(), ctx->memory_pool()); RETURN_NOT_OK(VisitArraySpanInline( input, [&](value_type v) { @@ -130,7 +132,7 @@ struct TemporalToStringCastFunctor { const ArraySpan& input = batch[0].array; const auto& timezone = GetInputTimezone(*input.type); const auto& ty = checked_cast(*input.type); - BuilderType builder(input.type->GetSharedPtr(), ctx->memory_pool()); + BuilderType builder(TypeTraits::type_singleton(), ctx->memory_pool()); // Preallocate int64_t string_length = 19; // YYYY-MM-DD HH:MM:SS @@ -284,107 +286,214 @@ Status CastBinaryToBinaryOffsets(KernelContext* ctx, } template -enable_if_base_binary BinaryToBinaryCastExec(KernelContext* ctx, - const ExecSpan& batch, - ExecResult* out) { - const CastOptions& options = checked_cast(*ctx->state()).options; +Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { const ArraySpan& input = batch[0].array; - if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) { + // This presupposes that one was created in the invocation layer + ArrayData* output = out->array_data().get(); + output->SetNullCount(input.null_count); + + const auto& options = CastState::Get(ctx); + bool check_utf8 = !I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8; + if (check_utf8) { InitializeUTF8(); - ArraySpanVisitor visitor; - Utf8Validator validator; - 
RETURN_NOT_OK(visitor.Visit(input, &validator)); } - // Start with a zero-copy cast, but change indices to expected size - RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out)); - return CastBinaryToBinaryOffsets( - ctx, input, out->array_data().get()); -} + [[maybe_unused]] auto SimpleUtf8Validation = [&] { + if (check_utf8) { + Utf8Validator validator; + return ArraySpanVisitor::Visit(input, &validator); + } + return Status::OK(); + }; -template -enable_if_t::value && - !std::is_same::value, - Status> -BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const CastOptions& options = checked_cast(*ctx->state()).options; - const ArraySpan& input = batch[0].array; + constexpr bool kInputOffsets = + std::is_base_of_v || std::is_base_of_v; - if (O::is_utf8 && !options.allow_invalid_utf8) { - InitializeUTF8(); - ArraySpanVisitor visitor; - Utf8Validator validator; - RETURN_NOT_OK(visitor.Visit(input, &validator)); + constexpr bool kInputViews = std::is_base_of_v; + + constexpr bool kInputFixed = std::is_same_v; + + constexpr bool kOutputOffsets = + std::is_base_of_v || std::is_base_of_v; + + constexpr bool kOutputViews = std::is_base_of_v; + + constexpr bool kOutputFixed = std::is_same_v; + + if constexpr (kInputOffsets && kOutputOffsets) { + // Start with a zero-copy cast, but change indices to expected size + RETURN_NOT_OK(SimpleUtf8Validation()); + RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out)); + // FIXME(bkietz) this discards preallocated storage. It seems preferable to me to + // allocate a new null bitmap if necessary than to always allocate new offsets. 
+ return CastBinaryToBinaryOffsets( + ctx, input, out->array_data().get()); } - // Check for overflow - using output_offset_type = typename O::offset_type; - constexpr output_offset_type kMaxOffset = - std::numeric_limits::max(); - const int32_t width = input.type->byte_width(); - const int64_t max_offset = width * input.length; - if (max_offset > kMaxOffset) { - return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", - out->type()->ToString(), ": input array too large"); + if constexpr (kInputViews && kOutputViews) { + return SimpleUtf8Validation() & ZeroCopyCastExec(ctx, batch, out); } - // This presupposes that one was created in the invocation layer - ArrayData* output = out->array_data().get(); + if constexpr (kInputViews && kOutputOffsets) { + if (input.MayHaveNulls()) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } else { + output->buffers[0] = nullptr; + } - // Copy buffers over, then generate indices - output->length = input.length; - output->SetNullCount(input.null_count); - if (input.offset == output->offset) { - output->buffers[0] = input.GetBuffer(0); - } else { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, - input.offset, input.length)); - } + using offset_type = typename O::offset_type; + + auto* offset = output->buffers[1]->mutable_data_as(); + offset[0] = 0; + + util::span char_buffers{ + reinterpret_cast*>(input.buffers[2].data), + static_cast(input.buffers[2].size / sizeof(std::shared_ptr))}; + int64_t char_count = 0; + for (const auto& buf : char_buffers) { + char_count += buf->size(); + } + BufferBuilder char_builder{ctx->memory_pool()}; + RETURN_NOT_OK(char_builder.Reserve(char_count)); + + RETURN_NOT_OK(VisitArraySpanInline( + input, + [&](std::string_view v) { + if constexpr (std::is_same_v) { + if (ARROW_PREDICT_FALSE( + 
char_builder.length() + v.size() > + static_cast(std::numeric_limits::max()))) { + return Status::Invalid("Failed casting from ", input.type->ToString(), + " to ", out->type()->ToString(), + ": input array viewed too many characters"); + } + } + offset[1] = static_cast(offset[0] + v.size()); + ++offset; + return char_builder.Append(v); + }, + [&] { + offset[1] = offset[0]; + ++offset; + return Status::OK(); + })); - // This buffer is preallocated - output_offset_type* offsets = output->GetMutableValues(1); - offsets[0] = static_cast(input.offset * width); - for (int64_t i = 0; i < input.length; i++) { - offsets[i + 1] = offsets[i] + width; + RETURN_NOT_OK(SimpleUtf8Validation()); + return char_builder.Finish(&output->buffers[2]); } - // Data buffer (index 1) for FWBinary becomes data buffer for VarBinary - // (index 2). After ARROW-16757, we need to copy this memory instead of - // zero-copy it because a Scalar value promoted to an ArraySpan may be - // referencing a temporary buffer whose scope does not extend beyond the - // kernel execution. In that scenario, the validity bitmap above can be - // zero-copied because it points to static memory (either a byte with a 1 or - // a 0 depending on whether the value is null or not). - std::shared_ptr input_data = input.GetBuffer(1); - if (input_data != nullptr) { - ARROW_ASSIGN_OR_RAISE(output->buffers[2], input_data->CopySlice(0, input_data->size(), - ctx->memory_pool())); - } else { - // TODO(wesm): it should already be nullptr, so we may be able to remove - // this - output->buffers[2] = nullptr; + if constexpr ((kInputOffsets || kInputFixed) && kOutputViews) { + // TODO(bkietz) when outputting views, we *could* output into slices, + // provided we have a threadsafe place to stash accumulated buffers + // of character data. 
+ if (input.MayHaveNulls()) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } else { + output->buffers[0] = nullptr; + } + + // Borrow the input's character buffer + output->buffers.resize(3); + output->buffers[2] = input.GetBuffer(kInputFixed ? 1 : 2); + auto* buffer_data = output->buffers[2]->data_as(); + + auto* headers = output->buffers[1]->mutable_data_as(); + if (check_utf8) { + Utf8Validator validator; + return VisitArraySpanInline( + input, + [&](std::string_view v) { + new (headers++) + StringHeader{v.data(), static_cast(v.size()), 0, buffer_data}; + return validator.VisitValue(v); + }, + [&] { + *headers++ = StringHeader{}; + return Status::OK(); + }); + } else { + VisitArraySpanInline( + input, + [&](std::string_view v) { + new (headers++) + StringHeader{v.data(), static_cast(v.size()), 0, buffer_data}; + }, + [&] { *headers++ = StringHeader{}; }); + return Status::OK(); + } } - return Status::OK(); -} + if constexpr (kInputFixed && kOutputOffsets) { + RETURN_NOT_OK(SimpleUtf8Validation()); -template -enable_if_t::value && - std::is_same::value, - Status> -BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const CastOptions& options = checked_cast(*ctx->state()).options; - const int32_t in_width = batch[0].type()->byte_width(); - const int32_t out_width = - checked_cast(*options.to_type).byte_width(); - if (in_width != out_width) { - return Status::Invalid("Failed casting from ", batch[0].type()->ToString(), " to ", - options.to_type.ToString(), ": widths must match"); + using output_offset_type = typename O::offset_type; + + int32_t width = input.type->byte_width(); + + if constexpr (std::is_same_v) { + // Check for overflow + if (width * input.length > std::numeric_limits::max()) { + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + out->type()->ToString(), ": input array too 
large"); + } + } + + // Copy buffers over, then generate indices + output->length = input.length; + output->SetNullCount(input.null_count); + if (input.offset == output->offset) { + output->buffers[0] = input.GetBuffer(0); + } else { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } + + // This buffer is preallocated + auto* offsets = output->buffers[1]->mutable_data_as(); + offsets[0] = static_cast(input.offset * width); + for (int64_t i = 0; i < input.length; i++) { + offsets[i + 1] = offsets[i] + width; + } + + // Data buffer (index 1) for FWBinary becomes data buffer for VarBinary + // (index 2). After ARROW-16757, we need to copy this memory instead of + // zero-copy it because a Scalar value promoted to an ArraySpan may be + // referencing a temporary buffer whose scope does not extend beyond the + // kernel execution. In that scenario, the validity bitmap above can be + // zero-copied because it points to static memory (either a byte with a 1 or + // a 0 depending on whether the value is null or not). 
+ if (std::shared_ptr input_data = input.GetBuffer(1)) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[2], + input_data->CopySlice(0, input_data->size(), ctx->memory_pool())); + } else { + // TODO(wesm): it should already be nullptr, so we may be able to remove + // this + output->buffers[2] = nullptr; + } + + return Status::OK(); + } + + if constexpr (kInputFixed && kOutputFixed) { + if (input.type->byte_width() != output->type->byte_width()) { + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + output->type->ToString(), ": widths must match"); + } + return ZeroCopyCastExec(ctx, batch, out); } - return ZeroCopyCastExec(ctx, batch, out); + + Unreachable(); } #if defined(_MSC_VER) @@ -447,6 +556,8 @@ template void AddBinaryToBinaryCast(CastFunction* func) { AddBinaryToBinaryCast(func); AddBinaryToBinaryCast(func); + AddBinaryToBinaryCast(func); + AddBinaryToBinaryCast(func); AddBinaryToBinaryCast(func); AddBinaryToBinaryCast(func); AddBinaryToBinaryCast(func); @@ -459,6 +570,11 @@ std::vector> GetBinaryLikeCasts() { AddCommonCasts(Type::BINARY, binary(), cast_binary.get()); AddBinaryToBinaryCast(cast_binary.get()); + auto cast_binary_view = + std::make_shared("cast_binary_view", Type::BINARY_VIEW); + AddCommonCasts(Type::BINARY_VIEW, binary_view(), cast_binary_view.get()); + AddBinaryToBinaryCast(cast_binary_view.get()); + auto cast_large_binary = std::make_shared("cast_large_binary", Type::LARGE_BINARY); AddCommonCasts(Type::LARGE_BINARY, large_binary(), cast_large_binary.get()); @@ -471,6 +587,14 @@ std::vector> GetBinaryLikeCasts() { AddTemporalToStringCasts(cast_string.get()); AddBinaryToBinaryCast(cast_string.get()); + auto cast_string_view = + std::make_shared("cast_string_view", Type::STRING_VIEW); + AddCommonCasts(Type::STRING_VIEW, utf8_view(), cast_string_view.get()); + AddNumberToStringCasts(cast_string_view.get()); + AddDecimalToStringCasts(cast_string_view.get()); + AddTemporalToStringCasts(cast_string_view.get()); + 
AddBinaryToBinaryCast(cast_string_view.get()); + auto cast_large_string = std::make_shared("cast_large_string", Type::LARGE_STRING); AddCommonCasts(Type::LARGE_STRING, large_utf8(), cast_large_string.get()); @@ -481,17 +605,17 @@ std::vector> GetBinaryLikeCasts() { auto cast_fsb = std::make_shared("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY); - AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions), - cast_fsb.get()); + AddCommonCasts(Type::FIXED_SIZE_BINARY, kOutputTargetType, cast_fsb.get()); DCHECK_OK(cast_fsb->AddKernel( - Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, - OutputType(FirstType), + Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, kOutputTargetType, BinaryToBinaryCastExec, NullHandling::COMPUTED_NO_PREALLOCATE)); - return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb}; + return { + cast_binary, cast_binary_view, cast_large_binary, cast_string, + cast_string_view, cast_large_string, cast_fsb, + }; } -} // namespace internal -} // namespace compute +} // namespace compute::internal } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 083a85eb346..5384008b1b7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -145,7 +145,7 @@ static std::shared_ptr MaskArrayWithNullsAt(std::shared_ptr input, using arrow::internal::Bitmap; Bitmap is_valid(masked->buffers[0], 0, input->length()); - if (auto original = input->null_bitmap()) { + if (const auto& original = input->null_bitmap()) { is_valid.CopyFrom(Bitmap(original, input->offset(), input->length())); } else { is_valid.SetBitsTo(true); @@ -154,7 +154,7 @@ static std::shared_ptr MaskArrayWithNullsAt(std::shared_ptr input, for (int i : indices_to_mask) { is_valid.SetBitTo(i, false); } - return MakeArray(masked); + return MakeArray(std::move(masked)); } TEST(Cast, 
CanCast) { @@ -167,6 +167,9 @@ TEST(Cast, CanCast) { } }; + ExpectCanCast(boolean(), {utf8()}); + return; + auto ExpectCannotCast = [ExpectCanCast](std::shared_ptr from, std::vector> to_set) { ExpectCanCast(from, to_set, /*expected=*/false); @@ -198,17 +201,21 @@ TEST(Cast, CanCast) { ExpectCannotCast(from_numeric, {null()}); } - for (auto from_base_binary : kBaseBinaryTypes) { - ExpectCanCast(from_base_binary, {boolean()}); - ExpectCanCast(from_base_binary, kNumericTypes); - ExpectCanCast(from_base_binary, kBaseBinaryTypes); - ExpectCanCast(dictionary(int64(), from_base_binary), {from_base_binary}); + auto base_binary_and_view_types = kBaseBinaryTypes; + base_binary_and_view_types.push_back(binary_view()); + base_binary_and_view_types.push_back(utf8_view()); + + for (auto from : base_binary_and_view_types) { + ExpectCanCast(from, {boolean()}); + ExpectCanCast(from, kNumericTypes); + ExpectCanCast(from, base_binary_and_view_types); + ExpectCanCast(dictionary(int64(), from), {from}); // any cast which is valid for the dictionary is valid for the DictionaryArray - ExpectCanCast(dictionary(uint32(), from_base_binary), kBaseBinaryTypes); - ExpectCanCast(dictionary(int16(), from_base_binary), kNumericTypes); + ExpectCanCast(dictionary(uint32(), from), kBaseBinaryTypes); + ExpectCanCast(dictionary(int16(), from), kNumericTypes); - ExpectCannotCast(from_base_binary, {null()}); + ExpectCannotCast(from, {null()}); } ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI)}); @@ -1029,7 +1036,7 @@ TEST(Cast, DecimalToFloating) { } TEST(Cast, DecimalToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { for (auto decimal_type : {decimal128(5, 2), decimal256(5, 2)}) { CheckCast(ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "999.99"])"), ArrayFromJSON(string_type, R"(["0.00", null, "123.45", "999.99"])")); @@ -1548,7 +1555,7 @@ TEST(Cast, TimeZeroCopy) { } TEST(Cast, DateToString) { - for (auto 
string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(date32(), "[0, null]"), ArrayFromJSON(string_type, R"(["1970-01-01", null])")); CheckCast(ArrayFromJSON(date64(), "[86400000, null]"), @@ -1557,7 +1564,7 @@ TEST(Cast, DateToString) { } TEST(Cast, TimeToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(time32(TimeUnit::SECOND), "[1, 62]"), ArrayFromJSON(string_type, R"(["00:00:01", "00:01:02"])")); CheckCast( @@ -1567,7 +1574,7 @@ TEST(Cast, TimeToString) { } TEST(Cast, TimestampToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND), "[-30610224000, -5364662400]"), ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])")); @@ -1593,7 +1600,7 @@ TEST(Cast, TimestampToString) { } TEST_F(CastTimezone, TimestampWithZoneToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"), ArrayFromJSON(string_type, @@ -1779,7 +1786,7 @@ TEST(Cast, DurationToDurationMultiplyOverflow) { } TEST(Cast, DurationToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { for (auto unit : TimeUnit::values()) { CheckCast(ArrayFromJSON(duration(unit), "[0, null, 1234567, 2000]"), ArrayFromJSON(string_type, R"(["0", null, "1234567", "2000"])")); @@ -2016,6 +2023,10 @@ TEST(Cast, StringToTimestamp) { } static void AssertBinaryZeroCopy(std::shared_ptr lhs, std::shared_ptr rhs) { + for (auto id : {lhs->type_id(), rhs->type_id()}) { + // views cannot be zero copied + if (id == Type::BINARY_VIEW || id == Type::STRING_VIEW) return; + } // null 
bitmap and data buffers are always zero-copied AssertBufferSame(*lhs, *rhs, 0); AssertBufferSame(*lhs, *rhs, 2); @@ -2039,8 +2050,9 @@ static void AssertBinaryZeroCopy(std::shared_ptr lhs, std::shared_ptr empty always works CheckCast(ArrayFromJSON(bin_type, "[]"), ArrayFromJSON(string_type, "[]")); @@ -2058,13 +2070,14 @@ TEST(Cast, BinaryToString) { options.allow_invalid_utf8 = true; ASSERT_OK_AND_ASSIGN(auto strings, Cast(*invalid_utf8, string_type, options)); ASSERT_RAISES(Invalid, strings->ValidateFull()); + AssertBinaryZeroCopy(invalid_utf8, strings); } } auto from_type = fixed_size_binary(3); auto invalid_utf8 = FixedSizeInvalidUtf8(from_type); - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(from_type, "[]"), ArrayFromJSON(string_type, "[]")); // invalid utf-8 masked by a null bit is not an error @@ -2083,9 +2096,12 @@ TEST(Cast, BinaryToString) { // N.B. null buffer is not always the same if input sliced AssertBufferSame(*invalid_utf8, *strings, 0); - // ARROW-16757: we no longer zero copy, but the contents are equal - ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get()); - ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2])); + if (string_type->id() != Type::STRING_VIEW) { + // ARROW-16757: we no longer zero copy, but the contents are equal + ASSERT_NE(invalid_utf8->data()->buffers[1].get(), + strings->data()->buffers[2].get()); + ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2])); + } } } @@ -2154,7 +2170,7 @@ TEST(Cast, StringToString) { } TEST(Cast, IntToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), ArrayFromJSON(string_type, R"(["0", "1", "127", "-128", null])")); @@ -2186,7 +2202,7 @@ TEST(Cast, IntToString) { } TEST(Cast, 
FloatingToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast( ArrayFromJSON(float32(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); @@ -2198,7 +2214,7 @@ TEST(Cast, FloatingToString) { } TEST(Cast, BooleanToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(boolean(), "[true, true, false, null]"), ArrayFromJSON(string_type, R"(["true", "true", "false", null])")); } diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index a72ec99620b..44d3f3a447c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -862,6 +862,9 @@ TEST(MakeStruct, Array) { EXPECT_THAT(MakeStructor({i32, str}, {"i", "s"}), ResultWith(Datum(*StructArray::Make({i32, str}, field_names)))); + EXPECT_THAT(*MakeScalar("aa"), testing::Eq(StringScalar("aa"))); + EXPECT_EQ(*MakeStructor({i32, MakeScalar("aa")}, {"i", "s"})->type(), + StructType({field("i", i32->type()), field("s", str->type())})); // Scalars are broadcast to the length of the arrays EXPECT_THAT(MakeStructor({i32, MakeScalar("aa")}, {"i", "s"}), ResultWith(Datum(*StructArray::Make({i32, str}, field_names)))); diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 4581e6377a7..50daec6cbaa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -48,7 +48,6 @@ namespace compute { template class BaseTestStringKernels : public ::testing::Test { protected: - using OffsetType = typename TypeTraits::OffsetType; using ScalarType = typename TypeTraits::ScalarType; void CheckUnary(std::string func_name, std::string 
json_input, @@ -98,7 +97,14 @@ class BaseTestStringKernels : public ::testing::Test { } std::shared_ptr offset_type() { - return TypeTraits::type_singleton(); + if constexpr (is_binary_view_like_type::value) { + // Views do not have offsets, but Functions like binary_length + // will return the length as uint32 + return uint32(); + } else { + using OffsetType = typename TypeTraits::OffsetType; + return TypeTraits::type_singleton(); + } } template diff --git a/cpp/src/arrow/compute/kernels/vector_array_sort.cc b/cpp/src/arrow/compute/kernels/vector_array_sort.cc index 1499554a960..07fd0a4e27c 100644 --- a/cpp/src/arrow/compute/kernels/vector_array_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_array_sort.cc @@ -33,6 +33,7 @@ #include "arrow/util/bitmap.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/unreachable.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -77,14 +78,35 @@ struct PartitionNthToIndices { } const auto p = PartitionNulls( out_begin, out_end, arr, 0, options.null_placement); + auto nth_begin = out_begin + pivot; if (nth_begin >= p.non_nulls_begin && nth_begin < p.non_nulls_end) { - std::nth_element(p.non_nulls_begin, nth_begin, p.non_nulls_end, - [&arr](uint64_t left, uint64_t right) { - const auto lval = GetView::LogicalValue(arr.GetView(left)); - const auto rval = GetView::LogicalValue(arr.GetView(right)); - return lval < rval; - }); + if constexpr (is_binary_view_like_type::value) { + const StringHeader* headers = arr.raw_values(); + const auto* char_buffers = arr.data()->buffers.data() + 2; + + auto Partition = [&](auto has_raw_pointers) { + std::nth_element(p.non_nulls_begin, nth_begin, p.non_nulls_end, + [&](uint64_t left, uint64_t right) { + const auto& lval = headers[left]; + const auto& rval = headers[right]; + if constexpr (has_raw_pointers.value) return lval < rval; + // we must compare the views in place with the array's + // character buffers + return 
lval.LessThanIndexOffset(char_buffers, rval, + char_buffers); + }); + }; + arr.has_raw_pointers() ? Partition(/*has_raw_pointers=*/std::true_type{}) + : Partition(/*has_raw_pointers=*/std::false_type{}); + } else { + std::nth_element(p.non_nulls_begin, nth_begin, p.non_nulls_end, + [&arr](uint64_t left, uint64_t right) { + const auto lval = GetView::LogicalValue(arr.GetView(left)); + const auto rval = GetView::LogicalValue(arr.GetView(right)); + return lval < rval; + }); + } } return Status::OK(); } @@ -173,6 +195,53 @@ class ArrayCompareSorter { } }; +template <> +class ArrayCompareSorter { + public: + Result operator()(uint64_t* indices_begin, uint64_t* indices_end, + const Array& array, int64_t offset, + const ArraySortOptions& options, ExecContext*) { + const auto& values = checked_cast(array); + const StringHeader* headers = values.raw_values(); + + const auto p = PartitionNulls( + indices_begin, indices_end, values, offset, options.null_placement); + const auto* char_buffers = values.data()->buffers.data() + 2; + + auto Sort = [&](auto has_raw_pointers, auto ascending) { + std::stable_sort( + p.non_nulls_begin, p.non_nulls_end, [&](uint64_t left, uint64_t right) { + const auto& lhs = headers[left - offset]; + const auto& rhs = headers[right - offset]; + + if constexpr (has_raw_pointers.value) { + if constexpr (ascending.value) return lhs < rhs; + if constexpr (!ascending.value) return rhs < lhs; + } + + // we must compare the views in place with the array's + // character buffers + if constexpr (ascending.value) { + return lhs.LessThanIndexOffset(char_buffers, rhs, char_buffers); + } else { + return rhs.LessThanIndexOffset(char_buffers, lhs, char_buffers); + } + }); + }; + + if (options.order == SortOrder::Ascending) { + values.has_raw_pointers() + ? Sort(/*has_raw_pointers=*/std::true_type{}, /*ascending=*/std::true_type{}) + : Sort(/*has_raw_pointers=*/std::false_type{}, /*ascending=*/std::true_type{}); + } else { + values.has_raw_pointers() + ? 
Sort(/*has_raw_pointers=*/std::true_type{}, /*ascending=*/std::false_type{}) + : Sort(/*has_raw_pointers=*/std::false_type{}, /*ascending=*/std::false_type{}); + } + return p; + } +}; + template <> class ArrayCompareSorter { public: @@ -501,6 +570,11 @@ struct ArraySorter< ArrayCompareSorter impl; }; +template <> +struct ArraySorter { + ArrayCompareSorter impl; +}; + struct ArraySorterFactory { ArraySortFunc sorter; @@ -594,6 +668,15 @@ void AddArraySortingKernels(VectorKernel base, VectorFunction* func) { base.exec = GenerateVarBinaryBase(*physical_type); DCHECK_OK(func->AddKernel(base)); } + + base.signature = KernelSignature::Make({utf8_view()}, uint64()); + base.exec = ExecTemplate::Exec; + DCHECK_OK(func->AddKernel(base)); + + base.signature = KernelSignature::Make({binary_view()}, uint64()); + base.exec = ExecTemplate::Exec; + DCHECK_OK(func->AddKernel(base)); + base.signature = KernelSignature::Make({Type::FIXED_SIZE_BINARY}, uint64()); base.exec = ExecTemplate::Exec; DCHECK_OK(func->AddKernel(base)); diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 2eab7ae8afa..9f20b640271 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -30,6 +30,7 @@ #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/util/hashing.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -261,7 +262,7 @@ class HashKernel : public KernelState { // Base class for all "regular" hash kernel implementations // (NullType has a separate implementation) -template class RegularHashKernel : public HashKernel { public: @@ -501,39 +502,13 @@ class DictionaryHashKernel : public HashKernel { }; // ---------------------------------------------------------------------- - -template -struct HashKernelTraits {}; - -template -struct HashKernelTraits> { - using HashKernel = NullHashKernel; -}; - -template -struct HashKernelTraits> { - using 
HashKernel = RegularHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -Result> HashInitImpl(KernelContext* ctx, - const KernelInitArgs& args) { - using HashKernelType = typename HashKernelTraits::HashKernel; - auto result = std::make_unique(args.inputs[0].GetSharedPtr(), - args.options, ctx->memory_pool()); - RETURN_NOT_OK(result->Reset()); - return std::move(result); -} - -template +template Result> HashInit(KernelContext* ctx, const KernelInitArgs& args) { - return HashInitImpl(ctx, args); + auto result = std::make_unique(args.inputs[0].GetSharedPtr(), args.options, + ctx->memory_pool()); + RETURN_NOT_OK(result->Reset()); + return std::move(result); } template @@ -542,22 +517,22 @@ KernelInit GetHashInit(Type::type type_id) { // representation switch (type_id) { case Type::NA: - return HashInit; + return HashInit>; case Type::BOOL: - return HashInit; + return HashInit>; case Type::INT8: case Type::UINT8: - return HashInit; + return HashInit>; case Type::INT16: case Type::UINT16: - return HashInit; + return HashInit>; case Type::INT32: case Type::UINT32: case Type::FLOAT: case Type::DATE32: case Type::TIME32: case Type::INTERVAL_MONTHS: - return HashInit; + return HashInit>; case Type::INT64: case Type::UINT64: case Type::DOUBLE: @@ -566,22 +541,23 @@ KernelInit GetHashInit(Type::type type_id) { case Type::TIMESTAMP: case Type::DURATION: case Type::INTERVAL_DAY_TIME: - return HashInit; + return HashInit>; case Type::BINARY: case Type::STRING: - return HashInit; + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + return HashInit>; case Type::LARGE_BINARY: case Type::LARGE_STRING: - return HashInit; + return HashInit>; case Type::FIXED_SIZE_BINARY: case Type::DECIMAL128: case Type::DECIMAL256: - return HashInit; + return HashInit>; case Type::INTERVAL_MONTH_DAY_NANO: - return HashInit; + return HashInit>; default: - DCHECK(false); - return nullptr; + Unreachable("non hashable type"); } } @@ -591,31 
+567,11 @@ template Result> DictionaryHashInit(KernelContext* ctx, const KernelInitArgs& args) { const auto& dict_type = checked_cast(*args.inputs[0].type); - Result> indices_hasher; - switch (dict_type.index_type()->id()) { - case Type::INT8: - case Type::UINT8: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT16: - case Type::UINT16: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT32: - case Type::UINT32: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT64: - case Type::UINT64: - indices_hasher = HashInitImpl(ctx, args); - break; - default: - DCHECK(false) << "Unsupported dictionary index type"; - break; - } - RETURN_NOT_OK(indices_hasher); - return std::make_unique(std::move(indices_hasher.ValueOrDie()), - dict_type.value_type()); + ARROW_ASSIGN_OR_RAISE(auto indices_hasher, + GetHashInit(dict_type.index_type()->id())(ctx, args)); + return std::make_unique( + checked_pointer_cast(std::move(indices_hasher)), + dict_type.value_type()); } Status HashExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 64c3db204c9..d8b5d83fe9d 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -45,6 +45,8 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/int_util.h" +#include "arrow/util/range.h" +#include "arrow/util/span.h" namespace arrow { diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc index 25e30e65a35..5289b317110 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc @@ -26,8 +26,7 @@ #include "arrow/testing/random.h" #include "arrow/util/benchmark_util.h" -namespace arrow { -namespace compute { 
+namespace arrow::compute { constexpr auto kSeed = 0x0ff1ce; @@ -135,6 +134,13 @@ struct TakeBenchmark { Bench(values); } + void StringView() { + int32_t string_min_length = 0, string_max_length = 32; + auto values = rand.StringView(args.size, string_min_length, string_max_length, + args.null_proportion); + Bench(values); + } + void Bench(const std::shared_ptr& values) { double indices_null_proportion = indices_have_nulls ? args.null_proportion : 0; auto indices = @@ -196,6 +202,21 @@ struct FilterBenchmark { Bench(values); } + void StringView() { + int32_t string_min_length = 0, string_max_length = 32; + int32_t string_mean_length = (string_max_length + string_min_length) / 2; + // for an array of 50% null strings, we need to generate twice as many strings + // to ensure that they have an average of args.size total characters + int64_t array_size = args.size; + if (args.values_null_proportion < 1) { + array_size = static_cast(args.size / string_mean_length / + (1 - args.values_null_proportion)); + } + auto values = std::static_pointer_cast(rand.StringView( + array_size, string_min_length, string_max_length, args.values_null_proportion)); + Bench(values); + } + void Bench(const std::shared_ptr& values) { auto filter = rand.Boolean(values->length(), args.selected_proportion, args.filter_null_proportion); @@ -263,6 +284,14 @@ static void FilterStringFilterWithNulls(benchmark::State& state) { FilterBenchmark(state, true).String(); } +static void FilterStringViewFilterNoNulls(benchmark::State& state) { + FilterBenchmark(state, false).StringView(); +} + +static void FilterStringViewFilterWithNulls(benchmark::State& state) { + FilterBenchmark(state, true).StringView(); +} + static void FilterRecordBatchNoNulls(benchmark::State& state) { FilterBenchmark(state, false).BenchRecordBatch(); } @@ -304,7 +333,19 @@ static void TakeStringRandomIndicesWithNulls(benchmark::State& state) { } static void TakeStringMonotonicIndices(benchmark::State& state) { - TakeBenchmark(state, 
/*indices_with_nulls=*/false, /*monotonic=*/true).FSLInt64(); + TakeBenchmark(state, /*indices_with_nulls=*/false, /*monotonic=*/true).String(); +} + +static void TakeStringViewRandomIndicesNoNulls(benchmark::State& state) { + TakeBenchmark(state, false).StringView(); +} + +static void TakeStringViewRandomIndicesWithNulls(benchmark::State& state) { + TakeBenchmark(state, true).StringView(); +} + +static void TakeStringViewMonotonicIndices(benchmark::State& state) { + TakeBenchmark(state, /*indices_with_nulls=*/false, /*monotonic=*/true).StringView(); } void FilterSetArgs(benchmark::internal::Benchmark* bench) { @@ -321,6 +362,8 @@ BENCHMARK(FilterFSLInt64FilterNoNulls)->Apply(FilterSetArgs); BENCHMARK(FilterFSLInt64FilterWithNulls)->Apply(FilterSetArgs); BENCHMARK(FilterStringFilterNoNulls)->Apply(FilterSetArgs); BENCHMARK(FilterStringFilterWithNulls)->Apply(FilterSetArgs); +BENCHMARK(FilterStringViewFilterNoNulls)->Apply(FilterSetArgs); +BENCHMARK(FilterStringViewFilterWithNulls)->Apply(FilterSetArgs); void FilterRecordBatchSetArgs(benchmark::internal::Benchmark* bench) { for (auto num_cols : std::vector({10, 50, 100})) { @@ -349,6 +392,8 @@ BENCHMARK(TakeFSLInt64MonotonicIndices)->Apply(TakeSetArgs); BENCHMARK(TakeStringRandomIndicesNoNulls)->Apply(TakeSetArgs); BENCHMARK(TakeStringRandomIndicesWithNulls)->Apply(TakeSetArgs); BENCHMARK(TakeStringMonotonicIndices)->Apply(TakeSetArgs); +BENCHMARK(TakeStringViewRandomIndicesNoNulls)->Apply(TakeSetArgs); +BENCHMARK(TakeStringViewRandomIndicesWithNulls)->Apply(TakeSetArgs); +BENCHMARK(TakeStringViewMonotonicIndices)->Apply(TakeSetArgs); -} // namespace compute -} // namespace arrow +} // namespace arrow::compute diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index ea9ee8a102e..d88f816efc7 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ 
b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -39,6 +39,8 @@ #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/range.h" +#include "arrow/util/span.h" namespace arrow { @@ -149,12 +151,11 @@ class DropNullCounter { /// generate one take function for each byte width. We use the same /// implementation here for boolean and fixed-byte-size inputs with some /// template specialization. -template +template ::value, + uint8_t, typename ArrowType::c_type>> class PrimitiveFilterImpl { public: - using T = typename std::conditional::value, - uint8_t, typename ArrowType::c_type>::type; - PrimitiveFilterImpl(const ArraySpan& values, const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, ArrayData* out_arr) @@ -206,7 +207,7 @@ class PrimitiveFilterImpl { } else { bit_util::SetBitsTo(out_is_valid_, out_offset_ + out_position_, segment_length, false); - memset(out_data_ + out_offset_ + out_position_, 0, + memset(static_cast(out_data_ + out_offset_ + out_position_), 0, segment_length * sizeof(T)); out_position_ += segment_length; } @@ -228,7 +229,7 @@ class PrimitiveFilterImpl { } else { bit_util::SetBitsTo(out_is_valid_, out_offset_ + out_position_, segment_length, false); - memset(out_data_ + out_offset_ + out_position_, 0, + memset(static_cast(out_data_ + out_offset_ + out_position_), 0, segment_length * sizeof(T)); out_position_ += segment_length; } @@ -536,7 +537,7 @@ Status BinaryFilterNonNullImpl(KernelContext* ctx, const ArraySpan& values, // Append offsets for (int64_t i = 0; i < length; ++i) { offset_builder.UnsafeAppend(offset); - offset += raw_offsets[i + position + 1] - raw_offsets[i + position + 1]; + offset += raw_offsets[i + position + 1] - raw_offsets[i + position]; } return Status::OK(); }; @@ -824,6 +825,45 @@ Status BinaryFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* o return Status::OK(); } +Status 
BinaryViewFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + const ArraySpan& filter = batch[1].array; + const bool is_ree_filter = filter.type->id() == Type::RUN_END_ENCODED; + FilterOptions::NullSelectionBehavior null_selection = + FilterState::Get(ctx).null_selection_behavior; + + int64_t output_length = GetFilterOutputSize(filter, null_selection); + + ArrayData* out_arr = out->array_data().get(); + + const bool filter_null_count_is_zero = + is_ree_filter ? filter.child_data[1].null_count == 0 : filter.null_count == 0; + + // The output precomputed null count is unknown except in the narrow + // condition that all the values are non-null and the filter will not cause + // any new nulls to be created. + if (values.null_count == 0 && + (null_selection == FilterOptions::DROP || filter_null_count_is_zero)) { + out_arr->null_count = 0; + } else { + out_arr->null_count = kUnknownNullCount; + } + + // When neither the values nor filter is known to have any nulls, we will + // elect the optimized ExecNonNull path where there is no need to populate a + // validity bitmap. 
+ const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero; + + RETURN_NOT_OK(PreallocatePrimitiveArrayData( + ctx, output_length, sizeof(StringHeader) * CHAR_BIT, allocate_validity, out_arr)); + + PrimitiveFilterImpl(values, filter, null_selection, + out_arr) + .Exec(); + CloneBinaryViewCharacterBuffers(values, out_arr); + return Status::OK(); +} + // ---------------------------------------------------------------------- // Null filter @@ -1037,6 +1077,7 @@ void PopulateFilterKernels(std::vector* out) { {InputType(match::Primitive()), plain_filter, PrimitiveFilterExec}, {InputType(match::BinaryLike()), plain_filter, BinaryFilterExec}, {InputType(match::LargeBinaryLike()), plain_filter, BinaryFilterExec}, + {InputType(match::BinaryViewLike()), plain_filter, BinaryViewFilterExec}, {InputType(Type::FIXED_SIZE_BINARY), plain_filter, FSBFilterExec}, {InputType(null()), plain_filter, NullFilterExec}, {InputType(Type::DECIMAL128), plain_filter, FSBFilterExec}, @@ -1054,6 +1095,7 @@ void PopulateFilterKernels(std::vector* out) { {InputType(match::Primitive()), ree_filter, PrimitiveFilterExec}, {InputType(match::BinaryLike()), ree_filter, BinaryFilterExec}, {InputType(match::LargeBinaryLike()), ree_filter, BinaryFilterExec}, + {InputType(match::BinaryViewLike()), ree_filter, BinaryViewFilterExec}, {InputType(Type::FIXED_SIZE_BINARY), ree_filter, FSBFilterExec}, {InputType(null()), ree_filter, NullFilterExec}, {InputType(Type::DECIMAL128), ree_filter, FSBFilterExec}, diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc index 23b8b75bfa0..d205ef0fe33 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc @@ -39,7 +39,9 @@ #include "arrow/util/bit_util.h" #include "arrow/util/int_util.h" #include "arrow/util/logging.h" +#include "arrow/util/range.h" #include "arrow/util/ree_util.h" 
+#include "arrow/util/span.h" namespace arrow { @@ -917,6 +919,21 @@ Status MapTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { return TakeExec>(ctx, batch, out); } +void CloneBinaryViewCharacterBuffers(const ArraySpan& values, ArrayData* out_arr) { + // copy the character buffers into the output + util::span char_buffers( + reinterpret_cast*>(values.buffers[2].data), + values.buffers[2].size / sizeof(std::shared_ptr)); + + out_arr->buffers.resize(char_buffers.size() + 2); + auto out_char_buffers = util::span(out_arr->buffers).subspan(2); + + for (auto&& [out_char_buffer, char_buffer] : + arrow::internal::Zip(out_char_buffers, char_buffers)) { + out_char_buffer = char_buffer; + } +} + } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h index bcffdd820db..63692f3a173 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h @@ -85,6 +85,8 @@ Status DenseUnionTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status StructTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status MapTakeExec(KernelContext*, const ExecSpan&, ExecResult*); +void CloneBinaryViewCharacterBuffers(const ArraySpan&, ArrayData*); + } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc index ab80127731c..f5dc0e29b66 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc @@ -48,8 +48,7 @@ using internal::BitBlockCounter; using internal::CheckIndexBounds; using internal::OptionalBitBlockCounter; -namespace compute { -namespace internal { +namespace compute::internal { namespace { @@ -386,7 +385,9 @@ 
struct PrimitiveTakeImpl { ++position; } } else { - memset(out + position, 0, sizeof(ValueCType) * block.length); + memset(static_cast( + out + position), // silence "StringHeader is non-trivial" warning + 0, sizeof(ValueCType) * block.length); position += block.length; } } else { @@ -423,7 +424,9 @@ struct PrimitiveTakeImpl { ++position; } } else { - memset(out + position, 0, sizeof(ValueCType) * block.length); + memset(static_cast( + out + position), // silence "StringHeader is non-trivial" warning + 0, sizeof(ValueCType) * block.length); position += block.length; } } @@ -595,6 +598,24 @@ Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* return Status::OK(); } +Status BinaryViewTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + const ArraySpan& indices = batch[1].array; + + if (TakeState::Get(ctx).boundscheck) { + RETURN_NOT_OK(CheckIndexBounds(indices, values.length)); + } + + ArrayData* out_arr = out->array_data().get(); + RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, indices.length, + 8 * sizeof(StringHeader), + /*allocate_validity=*/true, out_arr)); + TakeIndexDispatch(values, indices, out_arr); + + CloneBinaryViewCharacterBuffers(values, out_arr); + return Status::OK(); +} + // ---------------------------------------------------------------------- // Null take @@ -834,6 +855,7 @@ void PopulateTakeKernels(std::vector* out) { {InputType(match::Primitive()), take_indices, PrimitiveTakeExec}, {InputType(match::BinaryLike()), take_indices, VarBinaryTakeExec}, {InputType(match::LargeBinaryLike()), take_indices, LargeVarBinaryTakeExec}, + {InputType(match::BinaryViewLike()), take_indices, BinaryViewTakeExec}, {InputType(Type::FIXED_SIZE_BINARY), take_indices, FSBTakeExec}, {InputType(null()), take_indices, NullTakeExec}, {InputType(Type::DECIMAL128), take_indices, FSBTakeExec}, @@ -849,6 +871,5 @@ void PopulateTakeKernels(std::vector* out) { }; } -} // namespace 
internal -} // namespace compute +} // namespace compute::internal } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index 5b624911ff5..e7c9efadc11 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -593,7 +593,7 @@ TYPED_TEST(TestFilterKernelWithDecimal, FilterNumeric) { ArrayFromJSON(boolean(), "[]"), this->drop_)); } -TEST(TestFilterKernel, NoValidityBitmapButUnknownNullCount) { +TEST_F(TestFilterKernel, NoValidityBitmapButUnknownNullCount) { auto values = ArrayFromJSON(int32(), "[1, 2, 3, 4]"); auto filter = ArrayFromJSON(boolean(), "[true, true, false, true]"); @@ -644,6 +644,14 @@ TYPED_TEST(TestFilterKernelWithString, FilterString) { this->AssertFilter(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b"])"); } +TEST_F(TestFilterKernel, FilterStringView) { + for (auto type : {utf8_view(), binary_view()}) { + AssertFilter(type, R"(["a", "b", "c"])", "[0, 1, 0]", R"(["b"])"); + AssertFilter(type, R"([null, "b", "c"])", "[0, 1, 0]", R"(["b"])"); + AssertFilter(type, R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b"])"); + } +} + TYPED_TEST(TestFilterKernelWithString, FilterDictionary) { auto dict = R"(["a", "b", "c", "d", "e"])"; this->AssertFilterDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[4]"); @@ -1292,6 +1300,24 @@ TYPED_TEST(TestTakeKernelWithString, TakeString) { "[2, 5]", &arr)); } +TEST_F(TestTakeKernel, TakeStringView) { + for (auto type : {utf8_view(), binary_view()}) { + ARROW_SCOPED_TRACE(*type); + CheckTake(type, R"(["a", "b", "c"])", "[0, 1, 0]", R"(["a", "b", "a"])"); + CheckTake(type, R"([null, "b", "c"])", "[0, 1, 0]", "[null, \"b\", null]"); + CheckTake(type, R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b", "a"])"); + + this->TestNoValidityBitmapButUnknownNullCount(type, R"(["a", "b", "c"])", + "[0, 1, 0]"); + + std::shared_ptr arr; + ASSERT_RAISES(IndexError, + 
TakeJSON(type, R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr)); + ASSERT_RAISES(IndexError, TakeJSON(type, R"(["a", "b", null, "ddd", "ee"])", int64(), + "[2, 5]", &arr)); + } +} + TYPED_TEST(TestTakeKernelWithString, TakeDictionary) { auto dict = R"(["a", "b", "c", "d", "e"])"; this->AssertTakeDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[3, 4, 3]"); @@ -1897,6 +1923,7 @@ TEST(TestFilter, RandomBoolean) { FilterRandomTest<>::Test(boolean()); } TEST(TestFilter, RandomString) { FilterRandomTest<>::Test(utf8()); FilterRandomTest<>::Test(large_utf8()); + FilterRandomTest<>::Test(utf8_view()); } TEST(TestFilter, RandomFixedSizeBinary) { @@ -1911,6 +1938,7 @@ TEST(TestTake, RandomBoolean) { TakeRandomTest::Test(boolean()); } TEST(TestTake, RandomString) { TakeRandomTest::Test(utf8()); TakeRandomTest::Test(large_utf8()); + TakeRandomTest::Test(utf8_view()); } TEST(TestTake, RandomFixedSizeBinary) { @@ -2046,12 +2074,21 @@ TYPED_TEST_SUITE(TestDropNullKernelWithString, BaseBinaryArrowTypes); TYPED_TEST(TestDropNullKernelWithString, DropNullString) { this->AssertDropNull(R"(["a", "b", "c"])", R"(["a", "b", "c"])"); - this->AssertDropNull(R"([null, "b", "c"])", "[\"b\", \"c\"]"); + this->AssertDropNull(R"([null, "b", "c"])", R"(["b", "c"])"); this->AssertDropNull(R"(["a", "b", null])", R"(["a", "b"])"); this->TestNoValidityBitmapButUnknownNullCount(this->value_type(), R"(["a", "b", "c"])"); } +TEST_F(TestDropNullKernel, DropNullStringView) { + for (auto type : {utf8_view(), binary_view()}) { + CheckDropNull(type, R"(["a", "b", "c"])", R"(["a", "b", "c"])"); + CheckDropNull(type, R"([null, "b", "c"])", R"(["b", "c"])"); + CheckDropNull(type, R"(["a", "b", null])", R"(["a", "b"])"); + this->TestNoValidityBitmapButUnknownNullCount(type, R"(["a", "b", "c"])"); + } +} + TYPED_TEST(TestDropNullKernelWithString, DropNullDictionary) { auto dict = R"(["a", "b", "c", "d", "e"])"; this->AssertDropNullDictionary(dict, "[3, 4, 2]", "[3, 4, 2]"); diff --git 
a/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc index 0cefddff5a1..39cfa085912 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc @@ -25,8 +25,7 @@ #include "arrow/util/benchmark_util.h" #include "arrow/util/logging.h" -namespace arrow { -namespace compute { +namespace arrow::compute { constexpr auto kSeed = 0x0ff1ce; constexpr int32_t kDictionarySize = 24; // a typical dictionary size @@ -136,6 +135,21 @@ static void ArraySortFuncStringBenchmark(benchmark::State& state, const Runner& ArraySortFuncBenchmark(state, runner, values); } +template +static void ArraySortFuncStringViewBenchmark(benchmark::State& state, + const Runner& runner, int32_t min_length, + int32_t max_length) { + RegressionArgs args(state); + + const int64_t array_size = + GetStringArraySize(args.size, min_length, max_length, args.null_proportion); + + auto rand = random::RandomArrayGenerator(kSeed); + auto values = rand.StringView(array_size, min_length, max_length, args.null_proportion); + + ArraySortFuncBenchmark(state, runner, values); +} + template static void ArraySortFuncStringDictBenchmark(benchmark::State& state, const Runner& runner, int32_t min_length, @@ -271,6 +285,18 @@ static void ArraySortIndicesStringWideDict(benchmark::State& state) { dict_size); } +static void ArraySortIndicesStringViewNarrow(benchmark::State& state) { + const auto min_length = 0; + const auto max_length = 16; + ArraySortFuncStringViewBenchmark(state, SortRunner(state), min_length, max_length); +} + +static void ArraySortIndicesStringViewWide(benchmark::State& state) { + const auto min_length = 0; + const auto max_length = 64; + ArraySortFuncStringViewBenchmark(state, SortRunner(state), min_length, max_length); +} + static void ArrayRankStringNarrow(benchmark::State& state) { const auto min_length = 0; const auto max_length = 16; @@ -458,6 +484,8 @@ 
BENCHMARK(ArraySortIndicesBool)->Apply(ArraySortIndicesSetArgs); BENCHMARK(ArraySortIndicesStringNarrow)->Apply(ArraySortIndicesSetArgs); BENCHMARK(ArraySortIndicesStringWide)->Apply(ArraySortIndicesSetArgs); BENCHMARK(ArraySortIndicesStringWideDict)->Apply(ArraySortIndicesSetArgs); +BENCHMARK(ArraySortIndicesStringViewNarrow)->Apply(ArraySortIndicesSetArgs); +BENCHMARK(ArraySortIndicesStringViewWide)->Apply(ArraySortIndicesSetArgs); BENCHMARK(ChunkedArraySortIndicesInt64Narrow)->Apply(ArraySortIndicesSetArgs); BENCHMARK(ChunkedArraySortIndicesInt64Wide)->Apply(ArraySortIndicesSetArgs); @@ -534,5 +562,4 @@ BENCHMARK(ArrayRankStringWide)->Apply(ArrayRankSetArgs); BENCHMARK(ChunkedArrayRankInt64Narrow)->Apply(ArrayRankSetArgs); BENCHMARK(ChunkedArrayRankInt64Wide)->Apply(ArrayRankSetArgs); -} // namespace compute -} // namespace arrow +} // namespace arrow::compute diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 3429a5a8785..2bbee76172b 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -593,7 +593,7 @@ template class TestArraySortIndicesForTemporal : public TestArraySortIndices {}; TYPED_TEST_SUITE(TestArraySortIndicesForTemporal, TemporalArrowTypes); -using StringSortTestTypes = testing::Types; +using StringSortTestTypes = testing::Types; template class TestArraySortIndicesForStrings : public TestArraySortIndices {}; diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index 5e214bdda4d..c595051dec9 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -710,6 +710,15 @@ struct ScalarToProtoImpl { s); } + Status Visit(const StringViewScalar& s) { + return FromBuffer([](Lit* lit, std::string&& s) { lit->set_string(std::move(s)); }, + s); + } + Status Visit(const BinaryViewScalar& s) 
{ + return FromBuffer([](Lit* lit, std::string&& s) { lit->set_binary(std::move(s)); }, + s); + } + Status Visit(const FixedSizeBinaryScalar& s) { return FromBuffer( [](Lit* lit, std::string&& s) { lit->set_fixed_binary(std::move(s)); }, s); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index 03d1f999a14..89692df7bed 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -262,6 +262,13 @@ struct DataTypeToProtoImpl { return SetWith(&substrait::Type::set_allocated_binary); } + Status Visit(const StringViewType& t) { + return SetWith(&::substrait::Type::set_allocated_string); + } + Status Visit(const BinaryViewType& t) { + return SetWith(&::substrait::Type::set_allocated_binary); + } + Status Visit(const FixedSizeBinaryType& t) { SetWithThen(&substrait::Type::set_allocated_fixed_binary)->set_length(t.byte_width()); return Status::OK(); diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index b6d3a3d7d8c..1ef076fac40 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -536,8 +536,8 @@ struct ArrayWriterV1 { is_nested_type::value || is_null_type::value || is_decimal_type::value || std::is_same::value || is_duration_type::value || is_interval_type::value || is_fixed_size_binary_type::value || - std::is_same::value || std::is_same::value || - std::is_same::value, + is_binary_view_like_type::value || std::is_same::value || + std::is_same::value || std::is_same::value, Status>::type Visit(const T& type) { return Status::NotImplemented(type.ToString()); diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index e1d4282cb26..0b6ae4f6206 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -264,7 +264,8 @@ TEST_P(TestFeather, TimeTypes) { TEST_P(TestFeather, VLenPrimitiveRoundTrip) { std::shared_ptr batch; - 
ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckRoundtrip(batch); } @@ -306,7 +307,8 @@ TEST_P(TestFeather, SliceFloatRoundTrip) { TEST_P(TestFeather, SliceStringsRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckSlices(batch); } diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index eea0c973028..4d2d803f3f6 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -847,6 +847,8 @@ Status GetDictConverter(const std::shared_ptr& type, PARAM_CONVERTER_CASE(Type::BINARY, StringConverter, BinaryType) PARAM_CONVERTER_CASE(Type::LARGE_STRING, StringConverter, LargeStringType) PARAM_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter, LargeBinaryType) + PARAM_CONVERTER_CASE(Type::STRING_VIEW, StringConverter, StringViewType) + PARAM_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter, BinaryViewType) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter, FixedSizeBinaryType) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter, Decimal128Type) @@ -905,6 +907,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::STRING_VIEW, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter<>) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 
6eee5955242..11d296f24f5 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -327,6 +327,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestString, TestStrings, StringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestBinary, TestStrings, BinaryType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeString, TestStrings, LargeStringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeBinary, TestStrings, LargeBinaryType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestStringView, TestStrings, StringViewType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestBinaryView, TestStrings, BinaryViewType); TEST(TestNull, Basics) { std::shared_ptr type = null(); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 1394516ecd5..23accc2390f 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -258,6 +258,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeBinary: *out = large_binary(); return Status::OK(); + case flatbuf::Type::BinaryView: + *out = binary_view(); + return Status::OK(); case flatbuf::Type::FixedSizeBinary: { auto fw_binary = static_cast(type_data); return FixedSizeBinaryType::Make(fw_binary->byteWidth()).Value(out); @@ -268,6 +271,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeUtf8: *out = large_utf8(); return Status::OK(); + case flatbuf::Type::Utf8View: + *out = utf8_view(); + return Status::OK(); case flatbuf::Type::Bool: *out = boolean(); return Status::OK(); @@ -534,6 +540,26 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + static Status CheckForRawPointers(const BinaryViewType& type) { + if (type.has_raw_pointers()) { + return Status::NotImplemented( + type.ToString(), " cannot be serialized; convert to index/offset format first"); + } + return Status::OK(); + } + + Status Visit(const BinaryViewType& type) { + fb_type_ = flatbuf::Type::BinaryView; + 
type_offset_ = flatbuf::CreateBinaryView(fbb_).Union(); + return CheckForRawPointers(type); + } + + Status Visit(const StringViewType& type) { + fb_type_ = flatbuf::Type::Utf8View; + type_offset_ = flatbuf::CreateUtf8View(fbb_).Union(); + return CheckForRawPointers(type); + } + Status Visit(const LargeBinaryType& type) { fb_type_ = flatbuf::Type::LargeBinary; type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union(); @@ -820,7 +846,7 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, FieldPosition field_pos, dictionary_id = encoding->id(); } - // 4. Is it an extension type? + // 4. Is it an extension or view type? if (metadata != nullptr) { // Look for extension metadata in custom_metadata field int name_index = metadata->FindKey(kExtensionTypeKeyName); @@ -967,6 +993,7 @@ static Status GetBodyCompression(FBB& fbb, const IpcWriteOptions& options, static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, const std::vector& nodes, const std::vector& buffers, + const std::vector& variadic_counts, const IpcWriteOptions& options, RecordBatchOffset* offset) { FieldNodeVector fb_nodes; RETURN_NOT_OK(WriteFieldNodes(fbb, nodes, &fb_nodes)); @@ -977,7 +1004,10 @@ static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, BodyCompressionOffset fb_compression; RETURN_NOT_OK(GetBodyCompression(fbb, options, &fb_compression)); - *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression); + auto fb_variadic_counts = fbb.CreateVector(variadic_counts); + + *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression, + fb_variadic_counts); return Status::OK(); } @@ -1224,11 +1254,12 @@ Status WriteRecordBatchMessage( int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_counts, const IpcWriteOptions& options, + 
std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, variadic_counts, + options, &record_batch)); return WriteFBMessage(fbb, flatbuf::MessageHeader::RecordBatch, record_batch.Union(), body_length, options.metadata_version, custom_metadata, options.memory_pool) @@ -1285,11 +1316,12 @@ Status WriteDictionaryMessage( int64_t id, bool is_delta, int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, variadic_counts, + options, &record_batch)); auto dictionary_batch = flatbuf::CreateDictionaryBatch(fbb, id, record_batch, is_delta).Union(); return WriteFBMessage(fbb, flatbuf::MessageHeader::DictionaryBatch, dictionary_batch, diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index abbed5b2dac..631a336f75a 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -201,7 +201,8 @@ Status WriteRecordBatchMessage( const int64_t length, const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); ARROW_EXPORT Result> WriteTensorMessage(const Tensor& tensor, @@ -225,7 +226,8 @@ Status WriteDictionaryMessage( const int64_t body_length, const std::shared_ptr& 
custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); static inline Result> WriteFlatbufferBuilder( flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 7de81eff7a7..9fb37d4b9f3 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -159,7 +159,7 @@ TEST_P(TestMessage, SerializeCustomMetadata) { ASSERT_OK(internal::WriteRecordBatchMessage( /*length=*/0, /*body_length=*/0, metadata, /*nodes=*/{}, - /*buffers=*/{}, options_, &serialized)); + /*buffers=*/{}, /*variadic_counts=*/{}, options_, &serialized)); ASSERT_OK_AND_ASSIGN(std::unique_ptr message, Message::Open(serialized, /*body=*/nullptr)); @@ -240,23 +240,33 @@ class TestSchemaMetadata : public ::testing::Test { } }; -const std::shared_ptr INT32 = std::make_shared(); - TEST_F(TestSchemaMetadata, PrimitiveFields) { - auto f0 = field("f0", std::make_shared()); - auto f1 = field("f1", std::make_shared(), false); - auto f2 = field("f2", std::make_shared()); - auto f3 = field("f3", std::make_shared()); - auto f4 = field("f4", std::make_shared()); - auto f5 = field("f5", std::make_shared()); - auto f6 = field("f6", std::make_shared()); - auto f7 = field("f7", std::make_shared()); - auto f8 = field("f8", std::make_shared()); - auto f9 = field("f9", std::make_shared(), false); - auto f10 = field("f10", std::make_shared()); - - Schema schema({f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", int8()), + field("f1", int16(), false), + field("f2", int32()), + field("f3", int64()), + field("f4", uint8()), + field("f5", uint16()), + field("f6", uint32()), + field("f7", uint64()), + field("f8", float32()), + field("f9", float64(), false), 
+ field("f10", boolean()), + })); +} + +TEST_F(TestSchemaMetadata, BinaryFields) { + CheckSchemaRoundtrip(Schema({ + field("f0", utf8()), + field("f1", binary()), + field("f2", large_utf8()), + field("f3", large_binary()), + field("f4", utf8_view()), + field("f5", binary_view()), + field("f6", fixed_size_binary(3)), + field("f7", fixed_size_binary(33)), + })); } TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { @@ -269,15 +279,14 @@ TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { } TEST_F(TestSchemaMetadata, NestedFields) { - auto type = list(int32()); - auto f0 = field("f0", type); - - std::shared_ptr type2( - new StructType({field("k1", INT32), field("k2", INT32), field("k3", INT32)})); - auto f1 = field("f1", type2); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", list(int32())), + field("f1", struct_({ + field("k1", int32()), + field("k2", int32()), + field("k3", int32()), + })), + })); } // Verify that nullable=false is well-preserved for child fields of map type. 
@@ -305,19 +314,15 @@ TEST_F(TestSchemaMetadata, NestedFieldsWithKeyValueMetadata) { TEST_F(TestSchemaMetadata, DictionaryFields) { { - auto dict_type = dictionary(int8(), int32(), true /* ordered */); - auto f0 = field("f0", dict_type); - auto f1 = field("f1", list(dict_type)); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + auto dict_type = dictionary(int8(), int32(), /*ordered=*/true); + CheckSchemaRoundtrip(Schema({ + field("f0", dict_type), + field("f1", list(dict_type)), + })); } { auto dict_type = dictionary(int8(), list(int32())); - auto f0 = field("f0", dict_type); - - Schema schema({f0}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } } @@ -325,9 +330,7 @@ TEST_F(TestSchemaMetadata, NestedDictionaryFields) { { auto inner_dict_type = dictionary(int8(), int32(), /*ordered=*/true); auto dict_type = dictionary(int16(), list(inner_dict_type)); - - Schema schema({field("f0", dict_type)}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } { auto dict_type1 = dictionary(int8(), utf8(), /*ordered=*/true); @@ -2870,21 +2873,21 @@ void GetReadRecordBatchReadRanges( // 1) read magic and footer length IO // 2) read footer IO // 3) read record batch metadata IO - ASSERT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); + EXPECT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); const int32_t magic_size = static_cast(strlen(ipc::internal::kArrowMagicBytes)); // read magic and footer length IO auto file_end_size = magic_size + sizeof(int32_t); auto footer_length_offset = buffer->size() - file_end_size; auto footer_length = bit_util::FromLittleEndian( util::SafeLoadAs(buffer->data() + footer_length_offset)); - ASSERT_EQ(read_ranges[0].length, file_end_size); + EXPECT_EQ(read_ranges[0].length, file_end_size); // read footer IO - ASSERT_EQ(read_ranges[1].length, footer_length); + EXPECT_EQ(read_ranges[1].length, footer_length); // read record batch 
metadata. The exact size is tricky to determine but it doesn't // matter for this test and it should be smaller than the footer. - ASSERT_LT(read_ranges[2].length, footer_length); + EXPECT_LE(read_ranges[2].length, footer_length); for (uint32_t i = 0; i < expected_body_read_lengths.size(); i++) { - ASSERT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); + EXPECT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); } } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 694cc732253..efe81ce573e 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -58,6 +58,7 @@ #include "arrow/util/thread_pool.h" #include "arrow/util/ubsan.h" #include "arrow/util/vector.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" #include "generated/File_generated.h" // IWYU pragma: export @@ -243,6 +244,15 @@ class ArrayLoader { } } + Result GetVariadicCount(int i) { + auto* variadic_counts = metadata_->variadicCounts(); + CHECK_FLATBUFFERS_NOT_NULL(variadic_counts, "RecordBatch.variadicCounts"); + if (i >= static_cast(variadic_counts->size())) { + return Status::IOError("variadic_count_index out of range."); + } + return static_cast(variadic_counts->Get(i)); + } + Status GetFieldMetadata(int field_index, ArrayData* out) { auto nodes = metadata_->nodes(); CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); @@ -289,7 +299,6 @@ class ArrayLoader { return Status::OK(); } - template Status LoadBinary(Type::type type_id) { out_->buffers.resize(3); @@ -345,7 +354,22 @@ class ArrayLoader { template enable_if_base_binary Visit(const T& type) { - return LoadBinary(type.id()); + return LoadBinary(type.id()); + } + + Status Visit(const BinaryViewType& type) { + out_->buffers.resize(2); + + RETURN_NOT_OK(LoadCommon(type.id())); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); + + ARROW_ASSIGN_OR_RAISE(auto character_buffer_count, + GetVariadicCount(variadic_count_index_++)); + 
out_->buffers.resize(character_buffer_count + 2); + for (size_t i = 0; i < character_buffer_count; ++i) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2])); + } + return Status::OK(); } Status Visit(const FixedSizeBinaryType& type) { @@ -440,6 +464,7 @@ class ArrayLoader { int buffer_index_ = 0; int field_index_ = 0; bool skip_io_ = false; + int variadic_count_index_ = 0; BatchDataReadRequest read_request_; const Field* field_ = nullptr; @@ -570,9 +595,9 @@ Result> LoadRecordBatchSubset( // swap endian in a set of ArrayData if necessary (swap_endian == true) if (context.swap_endian) { - for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { - ARROW_ASSIGN_OR_RAISE(filtered_columns[i], - arrow::internal::SwapEndianArrayData(filtered_columns[i])); + for (auto& filtered_column : filtered_columns) { + ARROW_ASSIGN_OR_RAISE(filtered_column, + arrow::internal::SwapEndianArrayData(filtered_column)); } } return RecordBatch::Make(std::move(filtered_schema), metadata->length(), diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 53721c0b20f..6faaf96b332 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -351,39 +351,32 @@ static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls return builder.Finish(out); } -Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls) { +Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls, + bool with_view_types) { const int64_t length = 500; - auto f0 = field("strings", utf8()); - auto f1 = field("binaries", binary()); - auto f2 = field("large_strings", large_utf8()); - auto f3 = field("large_binaries", large_binary()); - auto schema = ::arrow::schema({f0, f1, f2, f3}); - - std::shared_ptr a0, a1, a2, a3; - MemoryPool* pool = default_memory_pool(); - // Quirk with RETURN_NOT_OK macro and templated functions - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a0); - 
RETURN_NOT_OK(s); + ArrayVector arrays; + FieldVector fields; + + auto AppendColumn = [&](auto& MakeArray) { + arrays.emplace_back(); + RETURN_NOT_OK(MakeArray(length, with_nulls, default_memory_pool(), &arrays.back())); + + const auto& type = arrays.back()->type(); + fields.push_back(field(type->ToString(), type)); + return Status::OK(); + }; + + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + if (with_view_types) { + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); } - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a1); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a2); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a3); - RETURN_NOT_OK(s); - } - *out = RecordBatch::Make(schema, length, {a0, a1, a2, a3}); + + *out = RecordBatch::Make(schema(std::move(fields)), length, std::move(arrays)); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index 5e0c65556c6..fc0c8ddbea3 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -96,7 +96,7 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatch(std::shared_ptr* out, - bool with_nulls = true); + bool with_nulls = true, bool with_view_types = true); ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index f0f0e96ee46..70d66ebfaaf 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -52,10 +52,12 @@ #include 
"arrow/util/checked_cast.h" #include "arrow/util/compression.h" #include "arrow/util/endian.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/parallel.h" #include "arrow/visit_array_inline.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -174,7 +176,8 @@ class RecordBatchSerializer { // Override this for writing dictionary metadata virtual Status SerializeMetadata(int64_t num_rows) { return WriteRecordBatchMessage(num_rows, out_->body_length, custom_metadata_, - field_nodes_, buffer_meta_, options_, &out_->metadata); + field_nodes_, buffer_meta_, variadic_counts_, options_, + &out_->metadata); } bool ShouldCompress(int64_t uncompressed_size, int64_t compressed_size) const { @@ -291,6 +294,8 @@ class RecordBatchSerializer { offset += size + padding; } + variadic_counts_ = out_->variadic_counts; + out_->body_length = offset - buffer_start_offset_; DCHECK(bit_util::IsMultipleOf8(out_->body_length)); @@ -398,6 +403,18 @@ class RecordBatchSerializer { return Status::OK(); } + Status Visit(const BinaryViewArray& array) { + auto headers = SliceBuffer(array.values(), array.offset() * sizeof(StringHeader), + array.length() * sizeof(StringHeader)); + out_->body_buffers.emplace_back(std::move(headers)); + + out_->variadic_counts.emplace_back(array.data()->buffers.size() - 2); + for (size_t i = 2; i < array.data()->buffers.size(); ++i) { + out_->body_buffers.emplace_back(array.data()->buffers[i]); + } + return Status::OK(); + } + template enable_if_base_list Visit(const T& array) { using offset_type = typename T::offset_type; @@ -585,6 +602,7 @@ class RecordBatchSerializer { std::vector field_nodes_; std::vector buffer_meta_; + std::vector variadic_counts_; const IpcWriteOptions& options_; int64_t max_recursion_depth_; @@ -601,8 +619,8 @@ class DictionarySerializer : public RecordBatchSerializer { Status SerializeMetadata(int64_t 
num_rows) override { return WriteDictionaryMessage(dictionary_id_, is_delta_, num_rows, out_->body_length, - custom_metadata_, field_nodes_, buffer_meta_, options_, - &out_->metadata); + custom_metadata_, field_nodes_, buffer_meta_, + variadic_counts_, options_, &out_->metadata); } Status Assemble(const std::shared_ptr& dictionary) { diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 9e18a213ba3..0b62c011d88 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -57,6 +57,7 @@ struct IpcPayload { MessageType type = MessageType::NONE; std::shared_ptr metadata; std::vector> body_buffers; + std::vector variadic_counts; int64_t body_length = 0; // serialized body length (padded, maybe compressed) int64_t raw_body_length = 0; // initial uncompressed body length }; diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc index 04ebe4714ce..c393b77acf3 100644 --- a/cpp/src/arrow/json/converter.cc +++ b/cpp/src/arrow/json/converter.cc @@ -304,6 +304,8 @@ Status MakeConverter(const std::shared_ptr& out_type, MemoryPool* pool CONVERTER_CASE(Type::STRING, BinaryConverter); CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter); CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter); + CONVERTER_CASE(Type::BINARY_VIEW, BinaryConverter); + CONVERTER_CASE(Type::STRING_VIEW, BinaryConverter); CONVERTER_CASE(Type::DECIMAL128, DecimalConverter); CONVERTER_CASE(Type::DECIMAL256, DecimalConverter); default: diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 0f7b3466fdb..f7ab6fd1027 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -110,8 +110,7 @@ struct GenerateImpl { return OK(writer.Double(val)); } - template - enable_if_base_binary Visit(const T&) { + Status GenerateAscii(const DataType&) { auto size = std::poisson_distribution<>{4}(e); std::uniform_int_distribution gen_char(32, 126); // FIXME generate UTF8 std::string s(size, '\0'); @@ -119,6 
+118,13 @@ struct GenerateImpl { return OK(writer.String(s.c_str())); } + template + enable_if_base_binary Visit(const T& t) { + return GenerateAscii(t); + } + + Status Visit(const BinaryViewType& t) { return GenerateAscii(t); } + template enable_if_list_like Visit(const T& t) { auto size = std::poisson_distribution<>{4}(e); diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 0537ddafe29..c20f7396990 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -263,6 +263,12 @@ struct ScalarValidateImpl { Status Visit(const StringScalar& s) { return ValidateStringScalar(s); } + Status Visit(const BinaryViewScalar& s) { return ValidateBinaryScalar(s); } + + Status Visit(const StringViewScalar& s) { return ValidateStringScalar(s); } + + Status Visit(const LargeBinaryScalar& s) { return ValidateBinaryScalar(s); } + Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); } template @@ -548,17 +554,8 @@ Status Scalar::ValidateFull() const { return ScalarValidateImpl(/*full_validation=*/true).Validate(*this); } -BinaryScalar::BinaryScalar(std::string s) - : BinaryScalar(Buffer::FromString(std::move(s))) {} - -StringScalar::StringScalar(std::string s) - : StringScalar(Buffer::FromString(std::move(s))) {} - -LargeBinaryScalar::LargeBinaryScalar(std::string s) - : LargeBinaryScalar(Buffer::FromString(std::move(s))) {} - -LargeStringScalar::LargeStringScalar(std::string s) - : LargeStringScalar(Buffer::FromString(std::move(s))) {} +BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {} FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr value, std::shared_ptr type, diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index d23b33e28f7..97a6b4787d4 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -37,6 +37,7 @@ #include "arrow/type_traits.h" #include "arrow/util/compare.h" #include "arrow/util/decimal.h" 
+#include "arrow/util/string_header.h" #include "arrow/util/visibility.h" #include "arrow/visit_type_inline.h" @@ -254,22 +255,20 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase { return value ? std::string_view(*value) : std::string_view(); } - protected: BaseBinaryScalar(std::shared_ptr value, std::shared_ptr type) : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {} + + BaseBinaryScalar(std::string s, std::shared_ptr type); }; struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; - BinaryScalar(std::shared_ptr value, std::shared_ptr type) - : BaseBinaryScalar(std::move(value), std::move(type)) {} - explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} - explicit BinaryScalar(std::string s); + explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} }; @@ -281,11 +280,39 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { explicit StringScalar(std::shared_ptr value) : StringScalar(std::move(value), utf8()) {} - explicit StringScalar(std::string s); + explicit StringScalar(std::string s) : BinaryScalar(std::move(s), utf8()) {} StringScalar() : StringScalar(utf8()) {} }; +struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar { + using BaseBinaryScalar::BaseBinaryScalar; + using TypeClass = BinaryViewType; + + explicit BinaryViewScalar(std::shared_ptr value) + : BinaryViewScalar(std::move(value), binary_view()) {} + + explicit BinaryViewScalar(std::string s) + : BaseBinaryScalar(std::move(s), binary_view()) {} + + BinaryViewScalar() : BinaryViewScalar(binary_view()) {} + + std::string_view view() const override { return std::string_view(*this->value); } +}; + +struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { + using BinaryViewScalar::BinaryViewScalar; + using TypeClass = StringViewType; + + 
explicit StringViewScalar(std::shared_ptr value) + : StringViewScalar(std::move(value), utf8_view()) {} + + explicit StringViewScalar(std::string s) + : BinaryViewScalar(std::move(s), utf8_view()) {} + + StringViewScalar() : StringViewScalar(utf8_view()) {} +}; + struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; @@ -296,7 +323,8 @@ struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { explicit LargeBinaryScalar(std::shared_ptr value) : LargeBinaryScalar(std::move(value), large_binary()) {} - explicit LargeBinaryScalar(std::string s); + explicit LargeBinaryScalar(std::string s) + : BaseBinaryScalar(std::move(s), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} }; @@ -308,7 +336,8 @@ struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { explicit LargeStringScalar(std::shared_ptr value) : LargeStringScalar(std::move(value), large_utf8()) {} - explicit LargeStringScalar(std::string s); + explicit LargeStringScalar(std::string s) + : LargeBinaryScalar(std::move(s), large_utf8()) {} LargeStringScalar() : LargeStringScalar(large_utf8()) {} }; diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 13fc0b3e81d..35b60174178 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -176,10 +176,17 @@ using DecimalArrowTypes = ::testing::Types; using BaseBinaryArrowTypes = ::testing::Types; +using BaseBinaryOrBinaryViewLikeArrowTypes = + ::testing::Types; + using BinaryArrowTypes = ::testing::Types; using StringArrowTypes = ::testing::Types; +using StringOrStringViewArrowTypes = + ::testing::Types; + using ListArrowTypes = ::testing::Types; using UnionArrowTypes = ::testing::Types; diff --git a/cpp/src/arrow/testing/json_internal.cc b/cpp/src/arrow/testing/json_internal.cc index babff621b1f..b97a103c2b1 100644 --- a/cpp/src/arrow/testing/json_internal.cc +++ 
b/cpp/src/arrow/testing/json_internal.cc @@ -46,6 +46,8 @@ #include "arrow/util/formatting.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" +#include "arrow/util/range.h" +#include "arrow/util/span.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" #include "arrow/visit_array_inline.h" @@ -105,6 +107,11 @@ std::string GetTimeUnitName(TimeUnit::type unit) { return "UNKNOWN"; } +std::string_view GetStringView(const rj::Value& str) { + DCHECK(str.IsString()); + return {str.GetString(), str.GetStringLength()}; +} + class SchemaWriter { public: explicit SchemaWriter(const Schema& schema, const DictionaryFieldMapper& mapper, @@ -227,8 +234,9 @@ class SchemaWriter { template enable_if_t::value || is_primitive_ctype::value || - is_base_binary_type::value || is_base_list_type::value || - is_struct_type::value || is_run_end_encoded_type::value> + is_base_binary_type::value || is_binary_view_like_type::value || + is_base_list_type::value || is_struct_type::value || + is_run_end_encoded_type::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -386,6 +394,8 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } + Status Visit(const StringViewType& type) { return WritePrimitive("utf8view", type); } + Status Visit(const BinaryViewType& type) { return WritePrimitive("binaryview", type); } Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); } Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); } Status Visit(const FixedSizeBinaryType& type) { @@ -532,22 +542,19 @@ class ArrayWriter { } } - // Binary, encode to hexadecimal. 
- template - enable_if_binary_like WriteDataValues( - const ArrayType& arr) { - for (int64_t i = 0; i < arr.length(); ++i) { - writer_->String(HexEncode(arr.GetView(i))); - } - } - - // UTF8 string, write as is - template - enable_if_string_like WriteDataValues( - const ArrayType& arr) { + template + std::enable_if_t::value || + is_fixed_size_binary_type::value> + WriteDataValues(const ArrayType& arr) { for (int64_t i = 0; i < arr.length(); ++i) { - auto view = arr.GetView(i); - writer_->String(view.data(), static_cast(view.size())); + if constexpr (Type::is_utf8) { + // UTF8 string, write as is + auto view = arr.GetView(i); + writer_->String(view.data(), static_cast(view.size())); + } else { + // Binary, encode to hexadecimal. + writer_->String(HexEncode(arr.GetView(i))); + } } } @@ -646,6 +653,48 @@ class ArrayWriter { writer_->EndArray(); } + template + void WriteStringHeaderField(const ArrayType& array) { + writer_->Key(kData); + writer_->StartArray(); + for (int64_t i = 0; i < array.length(); ++i) { + auto s = array.raw_values()[i]; + writer_->StartObject(); + writer_->Key("SIZE"); + writer_->Int64(s.size()); + if (s.IsInline()) { + writer_->Key("INLINED"); + if constexpr (IsUtf8) { + writer_->String(s.GetInlineData(), StringHeader::kInlineSize); + } else { + writer_->String(HexEncode(s.GetInlineData(), StringHeader::kInlineSize)); + } + } else { + writer_->Key("PREFIX"); + writer_->String(HexEncode(s.GetPrefix().data(), StringHeader::kPrefixSize)); + writer_->Key("BUFFER_INDEX"); + writer_->Int64(s.GetBufferIndex()); + writer_->Key("OFFSET"); + writer_->Int64(s.GetBufferOffset()); + } + writer_->EndObject(); + } + writer_->EndArray(); + } + + void WriteVariadicBuffersField(const BinaryViewArray& arr) { + writer_->Key("VARIADIC_BUFFERS"); + writer_->StartArray(); + const auto& buffers = arr.data()->buffers; + for (size_t i = 2; i < buffers.size(); ++i) { + // Encode the character buffers into hexadecimal strings. 
+ // Even for arrays which contain utf-8, portions of the buffer not + // referenced by any view may be invalid. + writer_->String(buffers[i]->ToHexString()); + } + writer_->EndArray(); + } + void WriteValidityField(const Array& arr) { writer_->Key("VALIDITY"); writer_->StartArray(); @@ -686,8 +735,10 @@ class ArrayWriter { } template - enable_if_t::value, Status> Visit( - const ArrayType& array) { + enable_if_t::value && + !is_binary_view_like_type::value, + Status> + Visit(const ArrayType& array) { WriteValidityField(array); WriteDataField(array); SetNoChildren(); @@ -704,6 +755,21 @@ class ArrayWriter { return Status::OK(); } + template + enable_if_binary_view_like Visit( + const ArrayType& array) { + if (array.has_raw_pointers()) { + return Status::NotImplemented("serialization of ", array.type()->ToString()); + } + + WriteValidityField(array); + WriteStringHeaderField(array); + WriteVariadicBuffersField(array); + + SetNoChildren(); + return Status::OK(); + } + Status Visit(const DictionaryArray& array) { return VisitArrayValues(*array.indices()); } @@ -1065,6 +1131,10 @@ Status GetType(const RjObject& json_type, *type = utf8(); } else if (type_name == "binary") { *type = binary(); + } else if (type_name == "utf8view") { + *type = utf8_view(); + } else if (type_name == "binaryview") { + *type = binary_view(); } else if (type_name == "largeutf8") { *type = large_utf8(); } else if (type_name == "largebinary") { @@ -1341,7 +1411,7 @@ class ArrayReader { int64_t offset_end = ParseOffset(json_offsets[i + 1]); DCHECK(offset_end >= offset_start); - if (T::is_utf8) { + if constexpr (T::is_utf8) { auto str = val.GetString(); DCHECK(std::string(str).size() == static_cast(offset_end - offset_start)); RETURN_NOT_OK(builder.Append(str)); @@ -1367,6 +1437,93 @@ class ArrayReader { return FinishBuilder(&builder); } + template + enable_if_binary_view_like Visit(const ViewType& type) { + ARROW_ASSIGN_OR_RAISE(const auto json_views, GetDataArray(obj_)); + 
ARROW_ASSIGN_OR_RAISE(const auto json_variadic_bufs, + GetMemberArray(obj_, "VARIADIC_BUFFERS")); + + using internal::Zip; + using util::span; + + BufferVector buffers; + buffers.resize(json_variadic_bufs.Size() + 2); + for (auto [json_buf, buf] : Zip(json_variadic_bufs, span{buffers}.subspan(2))) { + auto hex_string = GetStringView(json_buf); + ARROW_ASSIGN_OR_RAISE( + buf, AllocateBuffer(static_cast(hex_string.size()) / 2, pool_)); + RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data())); + } + + TypedBufferBuilder validity_builder{pool_}; + RETURN_NOT_OK(validity_builder.Resize(length_)); + for (bool is_valid : is_valid_) { + validity_builder.UnsafeAppend(is_valid); + } + ARROW_ASSIGN_OR_RAISE(buffers[0], validity_builder.Finish()); + + ARROW_ASSIGN_OR_RAISE(buffers[1], + AllocateBuffer(length_ * sizeof(StringHeader), pool_)); + + span headers{buffers[1]->mutable_data_as(), + static_cast(length_)}; + + int64_t null_count = 0; + for (auto [json_view, header, is_valid] : Zip(json_views, headers, is_valid_)) { + if (!is_valid) { + header = {}; + ++null_count; + continue; + } + + DCHECK(json_view.IsObject()); + const auto& json_view_obj = json_view.GetObject(); + + auto json_size = json_view_obj.FindMember("SIZE"); + RETURN_NOT_INT("SIZE", json_size, json_view_obj); + auto size = static_cast(json_size->value.GetInt64()); + + if (StringHeader::IsInline(size)) { + auto json_inlined = json_view_obj.FindMember("INLINED"); + RETURN_NOT_STRING("INLINED", json_inlined, json_view_obj); + if constexpr (ViewType::is_utf8) { + DCHECK_EQ(json_inlined->value.GetStringLength(), StringHeader::kInlineSize); + header = StringHeader{json_inlined->value.GetString(), size}; + } else { + DCHECK_EQ(json_inlined->value.GetStringLength(), StringHeader::kInlineSize * 2); + std::array inlined; + RETURN_NOT_OK(ParseHexValues(GetStringView(json_inlined->value), + reinterpret_cast(inlined.data()))); + header = StringHeader{inlined.data(), size}; + } + continue; + } + + auto json_prefix 
= json_view_obj.FindMember("PREFIX"); + auto json_buffer_index = json_view_obj.FindMember("BUFFER_INDEX"); + auto json_offset = json_view_obj.FindMember("OFFSET"); + RETURN_NOT_STRING("PREFIX", json_prefix, json_view_obj); + RETURN_NOT_INT("BUFFER_INDEX", json_buffer_index, json_view_obj); + RETURN_NOT_INT("OFFSET", json_offset, json_view_obj); + + std::array prefix; + DCHECK_EQ(json_prefix->value.GetStringLength(), StringHeader::kPrefixSize * 2); + RETURN_NOT_OK(ParseHexValues(GetStringView(json_prefix->value), + reinterpret_cast(prefix.data()))); + + header = StringHeader{size, prefix, + static_cast(json_buffer_index->value.GetInt64()), + static_cast(json_offset->value.GetInt64())}; + + DCHECK_LE(header.GetBufferIndex(), buffers.size() - 2); + DCHECK_LE(static_cast(header.GetBufferOffset() + header.size()), + buffers[header.GetBufferIndex() + 2]->size()); + } + + data_ = ArrayData::Make(type_, length_, std::move(buffers), null_count); + return Status::OK(); + } + Status Visit(const DayTimeIntervalType& type) { DayTimeIntervalBuilder builder(pool_); diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index b8ea247a437..33b618b4a23 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -362,13 +362,12 @@ std::shared_ptr RandomArrayGenerator::Decimal256(std::shared_ptr +template static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { - using offset_type = typename TypeClass::offset_type; using BuilderType = typename TypeTraits::BuilderType; using OffsetArrowType = typename CTypeTraits::ArrowType; using OffsetArrayType = typename TypeTraits::ArrayType; @@ -386,7 +385,7 @@ static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int /*null_probability=*/0); std::vector str_buffer(max_length); - BuilderType builder(memory_pool, alignment); + BuilderType 
builder{memory_pool, alignment}; for (int64_t i = 0; i < size; ++i) { if (lengths->IsValid(i)) { @@ -429,6 +428,15 @@ std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( return *strings->View(binary()); } +std::shared_ptr RandomArrayGenerator::StringView(int64_t size, int32_t min_length, + int32_t max_length, + double null_probability, + int64_t alignment, + MemoryPool* memory_pool) { + return GenerateBinaryArray( + this, size, min_length, max_length, null_probability, alignment, memory_pool); +} + std::shared_ptr RandomArrayGenerator::StringWithRepeats( int64_t size, int64_t unique, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { @@ -842,6 +850,23 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t ->View(field.type()); } + case Type::type::STRING_VIEW: + case Type::type::BINARY_VIEW: { + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + + auto out = StringView(length, min_length, max_length, null_probability, alignment); + + if (internal::checked_cast(*field.type()) + .has_raw_pointers()) { + ABORT_NOT_OK(internal::SwapStringHeaderPointers( + *out->data(), out->data()->buffers[1]->mutable_data_as())); + } + return out->View(field.type()).ValueOrDie(); + } + case Type::type::DECIMAL128: return Decimal128(field.type(), length, null_probability, alignment, memory_pool); diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 1bd189c39c2..999afdc60fd 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -367,6 +367,22 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random StringViewArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min_length the lower 
bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] null_probability the probability of a value being null + /// + /// \return a generated Array + std::shared_ptr StringView(int64_t size, int32_t min_length, int32_t max_length, + double null_probability = 0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random LargeStringArray /// /// \param[in] size the size of the array to generate diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 2f6d6354b28..cff6cdd1354 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -160,6 +160,7 @@ auto values = ::testing::Values( field("uint32", uint32()), field("int32", int32()), field("uint64", uint64()), field("int64", int64()), field("float16", float16()), field("float32", float32()), field("float64", float64()), field("string", utf8()), field("binary", binary()), + field("string_view", utf8_view()), field("binary_view", binary_view()), field("fixed_size_binary", fixed_size_binary(8)), field("decimal128", decimal128(8, 3)), field("decimal128", decimal128(29, -5)), field("decimal256", decimal256(16, 4)), field("decimal256", decimal256(57, -6)), diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h index 4f4b03438fd..703593450e8 100644 --- a/cpp/src/arrow/testing/util.h +++ b/cpp/src/arrow/testing/util.h @@ -53,6 +53,14 @@ Status CopyBufferFromVector(const std::vector& values, MemoryPool* pool, return Status::OK(); } +template +Result> CopyBufferFromVector( + const std::vector& values, MemoryPool* pool = default_memory_pool()) { + std::shared_ptr out; + RETURN_NOT_OK(CopyBufferFromVector(values, pool, &out)); + return out; +} + // Sets 
approximately pct_null of the first n bytes in null_bytes to zero // and the rest to non-zero (true) values. ARROW_TESTING_EXPORT void random_null_bytes(int64_t n, double pct_null, diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 68dc2aabe96..e6125aae368 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -60,10 +60,14 @@ constexpr Type::type FixedSizeListType::type_id; constexpr Type::type BinaryType::type_id; +constexpr Type::type BinaryViewType::type_id; + constexpr Type::type LargeBinaryType::type_id; constexpr Type::type StringType::type_id; +constexpr Type::type StringViewType::type_id; + constexpr Type::type LargeStringType::type_id; constexpr Type::type FixedSizeBinaryType::type_id; @@ -126,6 +130,8 @@ std::vector AllTypeIds() { Type::BINARY, Type::LARGE_STRING, Type::LARGE_BINARY, + Type::STRING_VIEW, + Type::BINARY_VIEW, Type::FIXED_SIZE_BINARY, Type::STRUCT, Type::LIST, @@ -190,7 +196,9 @@ std::string ToString(Type::type id) { TO_STRING_CASE(INTERVAL_MONTHS) TO_STRING_CASE(DURATION) TO_STRING_CASE(STRING) + TO_STRING_CASE(STRING_VIEW) TO_STRING_CASE(BINARY) + TO_STRING_CASE(BINARY_VIEW) TO_STRING_CASE(LARGE_STRING) TO_STRING_CASE(LARGE_BINARY) TO_STRING_CASE(FIXED_SIZE_BINARY) @@ -593,10 +601,18 @@ std::string FixedSizeListType::ToString() const { std::string BinaryType::ToString() const { return "binary"; } +std::string BinaryViewType::ToString() const { + return raw_pointers_ ? "binary_view[RAW POINTERS]" : "binary_view"; +} + std::string LargeBinaryType::ToString() const { return "large_binary"; } std::string StringType::ToString() const { return "string"; } +std::string StringViewType::ToString() const { + return raw_pointers_ ? 
"string_view[RAW POINTERS]" : "string_view"; +} + std::string LargeStringType::ToString() const { return "large_string"; } int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } @@ -2320,8 +2336,10 @@ PARAMETER_LESS_FINGERPRINT(HalfFloat) PARAMETER_LESS_FINGERPRINT(Float) PARAMETER_LESS_FINGERPRINT(Double) PARAMETER_LESS_FINGERPRINT(Binary) +PARAMETER_LESS_FINGERPRINT(BinaryView) PARAMETER_LESS_FINGERPRINT(LargeBinary) PARAMETER_LESS_FINGERPRINT(String) +PARAMETER_LESS_FINGERPRINT(StringView) PARAMETER_LESS_FINGERPRINT(LargeString) PARAMETER_LESS_FINGERPRINT(Date32) PARAMETER_LESS_FINGERPRINT(Date64) @@ -2533,6 +2551,18 @@ TYPE_FACTORY(large_binary, LargeBinaryType) TYPE_FACTORY(date64, Date64Type) TYPE_FACTORY(date32, Date32Type) +const std::shared_ptr& utf8_view(bool has_raw_pointers) { + static std::shared_ptr io = std::make_shared(); + static std::shared_ptr raw = std::make_shared(true); + return has_raw_pointers ? raw : io; +} + +const std::shared_ptr& binary_view(bool has_raw_pointers) { + static std::shared_ptr io = std::make_shared(); + static std::shared_ptr raw = std::make_shared(true); + return has_raw_pointers ? 
raw : io; +} + std::shared_ptr fixed_size_binary(int32_t byte_width) { return std::make_shared(byte_width); } @@ -2782,7 +2812,7 @@ void InitStaticData() { // * Time32 // * Time64 // * Timestamp - g_primitive_types = {null(), boolean(), date32(), date64()}; + g_primitive_types = {null(), boolean(), date32(), date64(), binary_view(), utf8_view()}; Extend(g_numeric_types, &g_primitive_types); Extend(g_base_binary_types, &g_primitive_types); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 48228d43ef9..a77168cf43d 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/endian.h" #include "arrow/util/macros.h" +#include "arrow/util/string_header.h" #include "arrow/util/visibility.h" #include "arrow/visitor.h" // IWYU pragma: keep @@ -113,8 +115,14 @@ struct ARROW_EXPORT DataTypeLayout { std::vector buffers; /// Whether this type expects an associated dictionary array. bool has_dictionary = false; + /// If this is provided, the number of buffers expected is only lower-bounded by + /// buffers.size(). Buffers beyond this lower bound are expected to conform to + /// variadic_spec. 
+ std::optional variadic_spec; - explicit DataTypeLayout(std::vector v) : buffers(std::move(v)) {} + explicit DataTypeLayout(std::vector buffers, + std::optional variadic_spec = {}) + : buffers(std::move(buffers)), variadic_spec(variadic_spec) {} }; /// \brief Base class for all data types @@ -710,6 +718,40 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType { explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} }; +/// \brief Concrete type class for variable-size binary view data using +/// StringHeader structs +class ARROW_EXPORT BinaryViewType : public DataType { + public: + static constexpr Type::type type_id = Type::BINARY_VIEW; + static constexpr bool is_utf8 = false; + using PhysicalType = BinaryViewType; + + static constexpr const char* type_name() { return "binary_view"; } + + explicit BinaryViewType(bool has_raw_pointers = false) + : BinaryViewType(Type::BINARY_VIEW, has_raw_pointers) {} + + DataTypeLayout layout() const override { + return DataTypeLayout( + {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))}, + DataTypeLayout::VariableWidth()); + } + + std::string ToString() const override; + std::string name() const override { return "binary_view"; } + + bool has_raw_pointers() const { return raw_pointers_; } + + protected: + std::string ComputeFingerprint() const override; + + // Allow subclasses like StringType to change the logical type. 
+ explicit BinaryViewType(Type::type logical_type, bool has_raw_pointers) + : DataType(logical_type), raw_pointers_(has_raw_pointers) {} + + bool raw_pointers_ = false; +}; + /// \brief Concrete type class for large variable-size binary data class ARROW_EXPORT LargeBinaryType : public BaseBinaryType { public: @@ -756,6 +798,25 @@ class ARROW_EXPORT StringType : public BinaryType { std::string ComputeFingerprint() const override; }; +/// \brief Concrete type class for variable-size string data, utf8-encoded +class ARROW_EXPORT StringViewType : public BinaryViewType { + public: + static constexpr Type::type type_id = Type::STRING_VIEW; + static constexpr bool is_utf8 = true; + using PhysicalType = BinaryViewType; + + static constexpr const char* type_name() { return "utf8_view"; } + + explicit StringViewType(bool has_raw_pointers = false) + : BinaryViewType(Type::STRING_VIEW, has_raw_pointers) {} + + std::string ToString() const override; + std::string name() const override { return "utf8_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for large variable-size string data, utf8-encoded class ARROW_EXPORT LargeStringType : public LargeBinaryType { public: diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 657abbaecc4..c0569fc091f 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -108,6 +108,11 @@ class BinaryArray; class BinaryBuilder; struct BinaryScalar; +class BinaryViewType; +class BinaryViewArray; +class BinaryViewBuilder; +struct BinaryViewScalar; + class LargeBinaryType; class LargeBinaryArray; class LargeBinaryBuilder; @@ -123,6 +128,11 @@ class StringArray; class StringBuilder; struct StringScalar; +class StringViewType; +class StringViewArray; +class StringViewBuilder; +struct StringViewScalar; + class LargeStringType; class LargeStringArray; class LargeStringBuilder; @@ -413,6 +423,13 @@ struct Type { /// Run-end encoded data. 
RUN_END_ENCODED, + /// String (UTF8) view type with 4-byte prefix and inline small string + /// optimization + STRING_VIEW, + + /// Bytes view type with 4-byte prefix and inline small string optimization + BINARY_VIEW, + // Leave this at the end MAX_ID }; @@ -454,10 +471,14 @@ ARROW_EXPORT const std::shared_ptr& float32(); ARROW_EXPORT const std::shared_ptr& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr& utf8(); +/// \brief Return a StringViewType instance +ARROW_EXPORT const std::shared_ptr& utf8_view(bool has_raw_pointers = false); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr& large_utf8(); /// \brief Return a BinaryType instance ARROW_EXPORT const std::shared_ptr& binary(); +/// \brief Return a BinaryViewType instance +ARROW_EXPORT const std::shared_ptr& binary_view(bool has_raw_pointers = false); /// \brief Return a LargeBinaryType instance ARROW_EXPORT const std::shared_ptr& large_binary(); /// \brief Return a Date32Type instance diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 3c83da9f2e6..93e7e6a5f9d 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1045,9 +1045,21 @@ TEST(TestBinaryType, ToString) { TEST(TestStringType, ToString) { StringType str; ASSERT_EQ(str.id(), Type::STRING); + ASSERT_EQ(str.name(), std::string("utf8")); + ASSERT_EQ(str.type_name(), std::string("utf8")); ASSERT_EQ(str.ToString(), std::string("string")); } +TEST(TestBinaryViewType, ToString) { + BinaryViewType t1; + BinaryViewType e1; + StringViewType t2; + AssertTypeEqual(t1, e1); + AssertTypeNotEqual(t1, t2); + ASSERT_EQ(t1.id(), Type::BINARY_VIEW); + ASSERT_EQ(t1.ToString(), std::string("binary_view")); +} + TEST(TestLargeBinaryTypes, ToString) { BinaryType bt1; LargeBinaryType t1; diff --git a/cpp/src/arrow/type_traits.cc b/cpp/src/arrow/type_traits.cc index ac16afe4b8c..de328f322ad 100644 --- a/cpp/src/arrow/type_traits.cc +++ 
b/cpp/src/arrow/type_traits.cc @@ -88,6 +88,8 @@ int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index) { case Type::DURATION: case Type::INTERVAL_MONTH_DAY_NANO: // Stored as two 32-bit integers and a 64-bit // integer + case Type::STRING_VIEW: + case Type::BINARY_VIEW: return 8; case Type::DICTIONARY: case Type::EXTENSION: diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 7204fd6d85d..02426a33fb2 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -341,6 +341,16 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return binary(); } }; +template <> +struct TypeTraits { + using ArrayType = BinaryViewArray; + using BuilderType = BinaryViewBuilder; + using ScalarType = BinaryViewScalar; + using CType = StringHeader; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return binary_view(); } +}; + template <> struct TypeTraits { using ArrayType = LargeBinaryArray; @@ -371,6 +381,16 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return utf8(); } }; +template <> +struct TypeTraits { + using ArrayType = StringViewArray; + using BuilderType = StringViewBuilder; + using ScalarType = StringViewScalar; + using CType = StringHeader; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return utf8_view(); } +}; + template <> struct TypeTraits { using ArrayType = LargeStringArray; @@ -399,6 +419,11 @@ struct CTypeTraits : public TypeTraits { using ArrowType = StringType; }; +template <> +struct CTypeTraits : public TypeTraits { + using ArrowType = BinaryViewType; +}; + template <> struct CTypeTraits : public CTypeTraits {}; @@ -614,9 +639,28 @@ using is_string_type = template using enable_if_string = enable_if_t::value, R>; +template +using is_binary_view_like_type = std::is_base_of; + +template +using is_binary_view_type = std::is_same; + +template +using 
is_string_view_type = std::is_same; + +template +using enable_if_binary_view_like = enable_if_t::value, R>; + +template +using enable_if_binary_view = enable_if_t::value, R>; + +template +using enable_if_string_view = enable_if_t::value, R>; + template using is_string_like_type = - std::integral_constant::value && T::is_utf8>; + std::integral_constant::value && T::is_utf8) || + is_string_view_type::value>; template using enable_if_string_like = enable_if_t::value, R>; @@ -639,10 +683,9 @@ template using enable_if_fixed_width_type = enable_if_t::value, R>; template -using is_binary_like_type = - std::integral_constant::value && - !is_string_like_type::value) || - is_fixed_size_binary_type::value>; +using is_binary_like_type = std::integral_constant< + bool, (is_base_binary_type::value && !is_string_like_type::value) || + is_binary_view_type::value || is_fixed_size_binary_type::value>; template using enable_if_binary_like = enable_if_t::value, R>; @@ -801,8 +844,10 @@ using enable_if_has_c_type = enable_if_t::value, R>; template using has_string_view = std::integral_constant::value || + std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || std::is_same::value>; diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 4d364655787..719cad42aa4 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -111,3 +111,4 @@ add_arrow_benchmark(thread_pool_benchmark) add_arrow_benchmark(trie_benchmark) add_arrow_benchmark(utf8_util_benchmark) add_arrow_benchmark(value_parsing_benchmark) +add_arrow_benchmark(string_conversion_benchmark) diff --git a/cpp/src/arrow/util/range.h b/cpp/src/arrow/util/range.h index ea0fb0eeaab..d80979c766a 100644 --- a/cpp/src/arrow/util/range.h +++ b/cpp/src/arrow/util/range.h @@ -21,11 +21,11 @@ #include #include #include +#include #include #include -namespace arrow { -namespace internal { +namespace arrow::internal { /// 
Create a vector containing the values from start up to stop template @@ -151,5 +151,55 @@ LazyRange MakeLazyRange(Generator&& gen, int64_t length) { return LazyRange(std::forward(gen), length); } -} // namespace internal -} // namespace arrow +/// \brief A helper for iterating multiple ranges simultaneously, modelled after python's +/// built-in zip() function. +/// +/// \code {.cpp} +/// const std::vector& tables = ... +/// std::function()> GetNames = ... +/// for (auto&& [table, name] : Zip(tables, GetNames())) { +/// static_assert(std::is_same_v); +/// static_assert(std::is_same_v); +/// // temporaries (like this vector of strings) are kept alive for the +/// // duration of a loop and are safely movable). +/// RegisterTableWithName(std::move(name), &table); +/// } +/// \endcode +template +struct Zip; + +template +Zip(Ranges&&...) -> Zip, std::index_sequence_for>; + +template +struct Zip, std::index_sequence> { + explicit Zip(Ranges... ranges) : ranges_(std::forward(ranges)...) {} + + std::tuple ranges_; + + using sentinel = std::tuple(ranges_).end())...>; + + struct iterator : std::tuple(ranges_).begin())...> { + using std::tuple(ranges_).begin())...>::tuple; + + constexpr auto operator*() { + return std::tuple(*this))...>{*std::get(*this)...}; + } + + constexpr iterator& operator++() { + (++std::get(*this), ...); + return *this; + } + + constexpr bool operator!=(const sentinel& s) const { + bool all_iterators_valid = (... 
&& (std::get(*this) != std::get(s))); + return all_iterators_valid; + } + }; + + constexpr iterator begin() { return {std::get(ranges_).begin()...}; } + + constexpr sentinel end() { return {std::get(ranges_).end()...}; } +}; + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/sort.h b/cpp/src/arrow/util/sort.h index cdffe0b2317..6b01998ab59 100644 --- a/cpp/src/arrow/util/sort.h +++ b/cpp/src/arrow/util/sort.h @@ -24,18 +24,24 @@ #include #include -namespace arrow { -namespace internal { +#include "arrow/util/span.h" -template > -std::vector ArgSort(const std::vector& values, Cmp&& cmp = {}) { - std::vector indices(values.size()); +namespace arrow::internal { + +template > +std::vector ArgSort(arrow::util::span values, Cmp&& cmp = {}) { + std::vector indices(values.size()); std::iota(indices.begin(), indices.end(), 0); std::sort(indices.begin(), indices.end(), - [&](int64_t i, int64_t j) -> bool { return cmp(values[i], values[j]); }); + [&](I i, I j) -> bool { return cmp(values[i], values[j]); }); return indices; } +template +std::vector ArgSort(const Range& values, Cmp&&... cmp) { + return ArgSort(arrow::util::span{values}, std::forward(cmp)...); +} + template size_t Permute(const std::vector& indices, std::vector* values) { if (indices.size() <= 1) { @@ -74,5 +80,4 @@ size_t Permute(const std::vector& indices, std::vector* values) { return cycle_count; } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/span.h b/cpp/src/arrow/util/span.h new file mode 100644 index 00000000000..79ac3890e29 --- /dev/null +++ b/cpp/src/arrow/util/span.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +namespace arrow::util { + +/// std::span polyfill +template +class span { + static_assert(sizeof(T), + R"( +std::span allows contiguous_iterators instead of just pointers, the enforcement +of which requires T to be a complete type. arrow::util::span does not support +contiguous_iterators, but T is still required to be a complete type to prevent +writing code which would break when it is replaced by std::span.)"); + + public: + using element_type = T; + + span() = default; + span(const span&) = default; + span& operator=(const span&) = default; + + template >> + // NOLINTNEXTLINE runtime/explicit + constexpr span(span mut) : span{mut.data(), mut.size()} {} + + constexpr span(T* data, std::size_t count) : data_{data}, size_{count} {} + + constexpr span(T* begin, T* end) + : data_{begin}, size_{static_cast(end - begin)} {} + + template ().data()), T*>>, + typename = std::enable_if_t< + std::is_convertible_v().size()), std::size_t>>> + // NOLINTNEXTLINE runtime/explicit, non-const reference + constexpr span(R& range) : span{range.data(), range.size()} {} + + constexpr T* begin() const { return data_; } + constexpr T* end() const { return data_ + size_; } + constexpr T* data() const { return data_; } + + constexpr std::size_t size() const { return size_; } + constexpr std::size_t size_bytes() const { return size_ * sizeof(T); } + constexpr 
bool empty() const { return size_ == 0; } + + constexpr T& operator[](std::size_t i) { return data_[i]; } + constexpr const T& operator[](std::size_t i) const { return data_[i]; } + + constexpr span subspan(std::size_t offset) const { + if (offset > size_) return {}; + return {data_ + offset, size_ - offset}; + } + + constexpr span subspan(std::size_t offset, std::size_t count) const { + auto out = subspan(offset); + if (count < out.size_) { + out.size_ = count; + } + return out; + } + + constexpr bool operator==(span const& other) const { + if (size_ != other.size_) return false; + + T* ptr = data_; + for (T const& e : other) { + if (*ptr++ != e) return false; + } + return true; + } + constexpr bool operator!=(span const& other) const { return !(*this == other); } + + private: + T* data_{}; + std::size_t size_{}; +}; + +template +span(R&) -> span().data())>>; + +template +constexpr span as_bytes(span s) { + return {reinterpret_cast(s.data()), s.size_bytes()}; +} + +template +constexpr span as_writable_bytes(span s) { + return {reinterpret_cast(s.data()), s.size_bytes()}; +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/string.cc b/cpp/src/arrow/util/string.cc index 2055b4f47ea..07dfc2ce79f 100644 --- a/cpp/src/arrow/util/string.cc +++ b/cpp/src/arrow/util/string.cc @@ -90,6 +90,16 @@ Status ParseHexValue(const char* data, uint8_t* out) { return Status::OK(); } +Status ParseHexValues(std::string_view hex_string, uint8_t* out) { + if (hex_string.size() % 2 != 0) { + return Status::Invalid("Expected base16 hex string"); + } + for (size_t j = 0; j < hex_string.size() / 2; ++j) { + RETURN_NOT_OK(ParseHexValue(hex_string.data() + j * 2, out + j)); + } + return Status::OK(); +} + namespace internal { std::vector SplitString(std::string_view v, char delimiter, diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index d9777efc56a..d7e377773f6 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -46,7 +46,9 @@ 
ARROW_EXPORT std::string HexEncode(std::string_view str); ARROW_EXPORT std::string Escape(std::string_view str); -ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out); +ARROW_EXPORT Status ParseHexValue(const char* hex_pair, uint8_t* out); + +ARROW_EXPORT Status ParseHexValues(std::string_view hex_string, uint8_t* out); namespace internal { diff --git a/cpp/src/arrow/util/string_conversion_benchmark.cc b/cpp/src/arrow/util/string_conversion_benchmark.cc new file mode 100644 index 00000000000..28a412a516f --- /dev/null +++ b/cpp/src/arrow/util/string_conversion_benchmark.cc @@ -0,0 +1,252 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/unreachable.h" +#include "arrow/visit_data_inline.h" +#include "benchmark/benchmark.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array/array_binary.h" +#include "arrow/array/builder_binary.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/util/formatting.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/value_parsing.h" + +namespace arrow::internal { +namespace { + +// Matrix of benchmarks: +// +// Direction: +// - STRING <-> RAW VIEWS +// - STRING <-> IO VIEWS +// - IO VIEWS <-> RAW VIEWS +// +// View length: +// - pure inline +// - pure non-inline +// - mixed with small mean length +// - mixed with large mean length +// +// Character buffer count: +// - ensure there are multiple 1MB buffers +// - baseline with only a single character buffer? +constexpr int kCharacterCount = (1 << 20) * 16; + +// Null counts? +// Scrambled ordering? 
+ +enum { + kStrings, + kRawPointerViews, + kIndexOffsetViews, +}; +std::shared_ptr DataTypeFor(decltype(kStrings) enm) { + switch (enm) { + case kStrings: + return utf8(); + case kIndexOffsetViews: + return utf8_view(); + case kRawPointerViews: + return utf8_view(/*has_raw_pointers=*/true); + } + Unreachable(); +} + +enum { + kAlwaysInlineable, + kUsuallyInlineable, + kShortButNeverInlineable, + kLongAndSeldomInlineable, + kLongAndNeverInlineable, +}; + +StringViewArray ToStringViewArray(const StringArray& arr) { + auto header_buffer = AllocateBuffer(arr.length() * sizeof(StringHeader)).ValueOrDie(); + + StringHeadersFromStrings(*arr.data(), header_buffer->mutable_data_as()); + + return {arr.length(), + std::move(header_buffer), + {arr.value_data()}, + arr.null_bitmap(), + arr.null_count()}; +} + +StringArray ToStringArray(const StringViewArray& arr) { + int64_t char_count = 0; + for (size_t i = 2; i < arr.data()->buffers.size(); ++i) { + char_count += arr.data()->buffers[i]->size(); + } + + auto offset_buffer = AllocateBuffer((arr.length() + 1) * sizeof(int32_t)).ValueOrDie(); + auto* offset = offset_buffer->mutable_data_as(); + offset[0] = 0; + + BufferBuilder char_buffer_builder; + ABORT_NOT_OK(char_buffer_builder.Reserve(char_count)); + + ABORT_NOT_OK(VisitArraySpanInline( + *arr.data(), + [&](std::string_view v) { + offset[1] = offset[0] + v.size(); + ++offset; + return char_buffer_builder.Append(v); + }, + [&] { + offset[1] = offset[0]; + ++offset; + return Status::OK(); + })); + + auto char_buffer = char_buffer_builder.Finish().ValueOrDie(); + + return {arr.length(), std::move(offset_buffer), std::move(char_buffer), + arr.null_bitmap(), arr.null_count()}; +} + +std::shared_ptr ToRawPointers(const StringViewArray& io) { + auto raw_buf = AllocateBuffer(io.length() * sizeof(StringHeader)).ValueOrDie(); + auto st = + SwapStringHeaderPointers(*io.data(), raw_buf->mutable_data_as()); + ABORT_NOT_OK(st); + return raw_buf; +} + +std::shared_ptr ToIndexOffsets(const 
StringViewArray& raw) { + auto io_buf = AllocateBuffer(raw.length() * sizeof(StringHeader)).ValueOrDie(); + auto st = + SwapStringHeaderPointers(*raw.data(), io_buf->mutable_data_as()); + ABORT_NOT_OK(st); + return io_buf; +} + +template +static void ConvertViews(benchmark::State& state) { // NOLINT non-const reference + auto [min_length, max_length] = [] { + switch (StringLengthsAre) { + case kAlwaysInlineable: + return std::pair{0, 12}; + case kUsuallyInlineable: + return std::pair{0, 16}; + case kShortButNeverInlineable: + return std::pair{13, 30}; + case kLongAndSeldomInlineable: + return std::pair{0, 256}; + case kLongAndNeverInlineable: + return std::pair{13, 256}; + default: + Unreachable(); + } + }(); + + auto num_items = kCharacterCount / max_length; + + auto from_type = DataTypeFor(From); + auto to_type = DataTypeFor(To); + + auto from = random::GenerateArray(*field("", from_type, + key_value_metadata({ + {"null_probability", "0"}, + {"min_length", std::to_string(min_length)}, + {"max_length", std::to_string(max_length)}, + })), + num_items, 0xdeadbeef); + + uint64_t dummy = 0; + for (auto _ : state) { + if constexpr (From == kStrings && To == kIndexOffsetViews) { + dummy += ToStringViewArray(checked_cast(*from)).length(); + } + + if constexpr (From == kIndexOffsetViews && To == kStrings) { + dummy += ToStringArray(checked_cast(*from)).length(); + } + + if constexpr (From == kIndexOffsetViews && To == kRawPointerViews) { + dummy += ToRawPointers(checked_cast(*from))->size(); + } + + if constexpr (From == kRawPointerViews && To == kIndexOffsetViews) { + dummy += ToIndexOffsets(checked_cast(*from))->size(); + } + + benchmark::DoNotOptimize(dummy); + } + state.SetItemsProcessed(state.iterations() * num_items); +} + +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kIndexOffsetViews, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kIndexOffsetViews, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kIndexOffsetViews, 
kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kIndexOffsetViews, kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kIndexOffsetViews, kLongAndNeverInlineable); + +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kStrings, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kStrings, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kStrings, kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kStrings, kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kStrings, kLongAndNeverInlineable); + +/* +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kRawPointerViews, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kRawPointerViews, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kRawPointerViews, kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kRawPointerViews, kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kRawPointerViews, kLongAndNeverInlineable); + +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kStrings, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kStrings, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kStrings, kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kStrings, kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kStrings, kLongAndNeverInlineable); + */ + +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kIndexOffsetViews, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kIndexOffsetViews, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kIndexOffsetViews, + kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kIndexOffsetViews, + kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kIndexOffsetViews, + 
kLongAndNeverInlineable); + +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kRawPointerViews, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kRawPointerViews, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kRawPointerViews, + kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kRawPointerViews, + kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kRawPointerViews, + kLongAndNeverInlineable); + +} // namespace +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/string_header.cc b/cpp/src/arrow/util/string_header.cc new file mode 100644 index 00000000000..e12d4553b16 --- /dev/null +++ b/cpp/src/arrow/util/string_header.cc @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/util/string_header.h" + +namespace arrow { + +std::ostream& operator<<(std::ostream& os, const StringHeader& header) { + os.write(header.data(), header.size()); + return os; +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h new file mode 100644 index 00000000000..9341bcbc062 --- /dev/null +++ b/cpp/src/arrow/util/string_header.h @@ -0,0 +1,377 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace arrow { + +/// Variable length string or binary with 4 byte prefix and inline optimization +/// for small values (12 bytes or fewer). This is similar to std::string_view +/// except that the referenced data is limited in size to UINT32_MAX and up to the +/// first four bytes of the string are copied into the struct. The prefix allows +/// failing comparisons early and can reduce the CPU cache working set when +/// dealing with short strings. +/// +/// This structure supports three states: +/// +/// Short string |----|----|--------| +/// ^ ^ ^ +/// | | | +/// size prefix remaining in-line portion, zero padded +/// +/// Long string |----|----|--------| +/// ^ ^ ^ +/// | | | +/// size prefix raw pointer to out-of-line portion +/// +/// IO Long string |----|----|----|----| +/// ^ ^ ^ ^ +/// | | | `----------. +/// size prefix buffer index and offset to out-of-line portion +/// +/// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. +/// +/// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf +/// +/// There is no way to determine from a non-inline StringHeader whether it refers +/// to its out-of-line portion with a raw pointer or with index/offset. This +/// information is stored at the column level; so a buffer of StringHeader will +/// contain only one or the other. In general, unless a StringHeader is resident +/// in a StringView array's buffer, it will refer to out-of-line data with a raw +/// pointer. This default is assumed by several members of StringHeader such as +/// operator==() and operator string_view() since these and other operations cannot +/// be performed on index/offset StringHeaders without also accessing the buffers +/// storing their out-of-line data. The states that pertain to each accessor and +/// constructor are listed in their comments.
+struct alignas(8) StringHeader { + public: + using value_type = char; + + static constexpr size_t kTotalSize = 16; + static constexpr size_t kSizeSize = sizeof(uint32_t); + static constexpr size_t kIndexOffsetSize = sizeof(uint32_t) * 2; + static constexpr size_t kPrefixSize = kTotalSize - kSizeSize - kIndexOffsetSize; + static_assert(kPrefixSize == 4); + static constexpr size_t kInlineSize = kTotalSize - kPrefixSize; + static_assert(kInlineSize == 12); + + /// Construct an empty view. + StringHeader() = default; + + /// Construct a RAW POINTER view. + StringHeader(const char* data, size_t len) : size_(static_cast(len)) { + if (size_ == 0) return; + + // TODO(bkietz) better option than assert? + assert(data); + if (IsInline()) { + // small string: inlined. Bytes beyond size_ are already 0 + memcpy(prefix_.data(), data, size_); + } else { + // large string: store pointer + memcpy(prefix_.data(), data, kPrefixSize); + value_.data = data; + } + } + + /// Construct a RAW POINTER view. + StringHeader(const uint8_t* data, int64_t len) + : StringHeader(reinterpret_cast(data), static_cast(len)) {} + + /// Convenience implicit constructor for RAW POINTER views from C string/string literal. + /// + /// NOLINTNEXTLINE runtime/explicit + StringHeader(const char* data) : StringHeader(data, std::strlen(data)) {} + + /// Construct a RAW POINTER view. + explicit StringHeader(const std::string& value) + : StringHeader(value.data(), value.size()) {} + + /// Construct a RAW POINTER view. + explicit StringHeader(std::string_view value) + : StringHeader(value.data(), value.size()) {} + + /// Construct an INDEX/OFFSET view. + StringHeader(const char* data, uint32_t len, uint32_t buffer_index, + const char* buffer_data) + : size_(len) { + if (size_ == 0) return; + + // TODO(bkietz) better option than assert? + assert(data); + if (IsInline()) { + // small string: inlined. 
Bytes beyond size_ are already 0 + memcpy(prefix_.data(), data, size_); + } else { + // large string: store index/offset + memcpy(prefix_.data(), data, kPrefixSize); + SetIndexOffset(buffer_index, static_cast(data - buffer_data)); + } + } + + /// Construct an INDEX/OFFSET view. + StringHeader(uint32_t len, std::array prefix, uint32_t buffer_index, + uint32_t offset) + : size_(len), prefix_(prefix) { + SetIndexOffset(buffer_index, offset); + } + + template + static constexpr bool IsInline(I size) { + return size <= static_cast(kInlineSize); + } + static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; } + + /// True if the view's data is entirely stored inline. + /// This function is safe for use against both RAW POINTER and INDEX/OFFSET views. + bool IsInline() const { return IsInline(size_); } + + /// Return a RAW POINTER view's data. + const char* data() const& { return IsInline() ? prefix_.data() : value_.data; } + const char* data() && = delete; + + /// The number of characters viewed by this StringHeader. + /// This function is safe for use against both RAW POINTER and INDEX/OFFSET views. + size_t size() const { return size_; } + + /// Print a RAW POINTER view to a std::ostream. + friend std::ostream& operator<<(std::ostream& os, const StringHeader& header); + + /// Equality comparison between RAW POINTER views. + bool operator==(const StringHeader& other) const { + // Compare lengths and first 4 characters. + if (SizeAndPrefixAsInt64() != other.SizeAndPrefixAsInt64()) { + return false; + } + if (IsInline()) { + // The inline part is zeroed at construction, so we can compare + // a word at a time if data extends past 'prefix_'. + return size_ <= kPrefixSize || InlinedAsInt64() == other.InlinedAsInt64(); + } + // Sizes are equal and this is not inline, therefore both are out + // of line and have kPrefixSize first in common. 
+ return memcmp(value_.data + kPrefixSize, other.value_.data + kPrefixSize, + size_ - kPrefixSize) == 0; + } + + /// Inequality comparison between RAW POINTER views. + bool operator!=(const StringHeader& other) const { return !(*this == other); } + + /// Less-than comparison between RAW POINTER views. + bool operator<(const StringHeader& other) const { return Compare(other) < 0; } + + /// Less-than-or-equal comparison between RAW POINTER views. + bool operator<=(const StringHeader& other) const { return Compare(other) <= 0; } + + /// Greater-than comparison between RAW POINTER views. + bool operator>(const StringHeader& other) const { return Compare(other) > 0; } + + /// Greater-than-or-equal comparison between RAW POINTER views. + bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; } + + /// Conversion to std::string_view for RAW POINTER views. + explicit operator std::string_view() const& { return {data(), size()}; } + explicit operator std::string_view() && = delete; + + /// Return the always-inline cached first 4 bytes of this StringHeader. + /// This function is safe for use against both RAW POINTER and INDEX/OFFSET views. + std::array GetPrefix() const { return prefix_; } + + /// Return an INDEX/OFFSET view's buffer index. + uint32_t GetBufferIndex() const { return value_.io_data.buffer_index; } + + /// Return an INDEX/OFFSET view's buffer offset. + uint32_t GetBufferOffset() const { return value_.io_data.offset; } + + /// Return a RAW POINTER view's data pointer. + /// + /// NOT VALID FOR INLINE VIEWS. + const char* GetRawPointer() const { return value_.data; } + + /// Return an INDEX/OFFSET view's data pointer. + /// + /// NOT VALID FOR INLINE VIEWS. + template + const char* GetPointerFromBuffers(const BufferPtr* char_buffers) const { + return char_buffers[GetBufferIndex()]->template data_as() + GetBufferOffset(); + } + + /// Return an INDEX/OFFSET view's data pointer. 
+ template + const char* GetPointerFromBuffersOrInlineData(const BufferPtr* char_buffers) const { + return IsInline() ? GetInlineData() : GetPointerFromBuffers(char_buffers); + } + + /// Return the inline data of a view. + /// + /// For inline views, this points to the entire data of the view. + /// For other views, this points to the 4 byte prefix. + const char* GetInlineData() const& { return prefix_.data(); } + const char* GetInlineData() && = delete; + + /// Mutate into a RAW POINTER view. + /// + /// This function is only intended for use in converting from an equivalent INDEX/OFFSET + /// view; in particular it does not check or modify the prefix for consistency with the + /// new data pointer. + void SetRawPointer(const char* data) { value_.data = data; } + + /// Mutate into an INDEX/OFFSET view. + /// + /// This function is only intended for use in converting from an equivalent RAW POINTER + /// view; in particular it does not check or modify the prefix for consistency with the + /// new buffer index/offset. + void SetIndexOffset(uint32_t buffer_index, uint32_t offset) { + value_.io_data = {buffer_index, offset}; + } + + /// Equality compare an INDEX/OFFSET view in place. + /// + /// An equivalent comparison could be accomplished by (for example) first converting both + /// views to std::string_view and comparing those, but this would not take advantage + /// of the cached 4 byte prefix. + template + bool EqualsIndexOffset(const BufferPtr* char_buffers, const StringHeader& other, + const BufferPtr* other_char_buffers) const { + if (SizeAndPrefixAsInt64() != other.SizeAndPrefixAsInt64()) { + return false; + } + if (IsInline()) { + return InlinedAsInt64() == other.InlinedAsInt64(); + } + // Sizes are equal and this is not inline, therefore both are out of line and we + // have already checked that their kPrefixSize first characters are equal.
+ return memcmp(GetPointerFromBuffers(char_buffers) + kPrefixSize, + other.GetPointerFromBuffers(other_char_buffers) + kPrefixSize, + size() - kPrefixSize) == 0; + } + + /// Less-than compare an INDEX/OFFSET view in place. + /// + /// Equivalent comparison will be accomplished by (for example) first converting both + /// views to std::string_view and comparing those, but this would not take advantage + /// of the cached 4 byte prefix. + template + bool LessThanIndexOffset(const BufferPtr* char_buffers, const StringHeader& other, + const BufferPtr* other_char_buffers) const { + return CompareIndexOffset(char_buffers, other, other_char_buffers) < 0; + } + + private: + // Returns 0, if this == other + // < 0, if this < other + // > 0, if this > other + int32_t Compare(const StringHeader& other) const { + if (PrefixAsInt() != other.PrefixAsInt()) { + // The result is decided on prefix. The shorter will be less + // because the prefix is padded with zeros. + return memcmp(prefix_.data(), other.prefix_.data(), kPrefixSize); + } + int32_t size = std::min(size_, other.size_) - kPrefixSize; + if (size <= 0) { + // One string is just the prefix. + return size_ - other.size_; + } + if (static_cast(size) <= kInlineSize && IsInline() && other.IsInline()) { + int32_t result = memcmp(value_.inlined.data(), other.value_.inlined.data(), size); + return (result != 0) ? result : size_ - other.size_; + } + int32_t result = memcmp(data() + kPrefixSize, other.data() + kPrefixSize, size); + return (result != 0) ? result : size_ - other.size_; + } + + template + int CompareIndexOffset(const BufferPtr* char_buffers, const StringHeader& other, + const BufferPtr* other_char_buffers) const { + if (PrefixAsInt() != other.PrefixAsInt()) { + // The result is decided on prefix. The shorter will be less + // because the prefix is padded with zeros. 
+ return memcmp(prefix_.data(), other.prefix_.data(), kPrefixSize); + } + int32_t size = std::min(size_, other.size_) - kPrefixSize; + if (size <= 0) { + // One string is just the prefix. + return size_ - other.size_; + } + if (static_cast(size) <= kInlineSize && IsInline() && other.IsInline()) { + int32_t result = memcmp(value_.inlined.data(), other.value_.inlined.data(), size); + return (result != 0) ? result : size_ - other.size_; + } + + int32_t result = memcmp( + GetPointerFromBuffersOrInlineData(char_buffers) + kPrefixSize, + other.GetPointerFromBuffersOrInlineData(other_char_buffers) + kPrefixSize, size); + return (result != 0) ? result : size_ - other.size_; + } + + int64_t SizeAndPrefixAsInt64() const { + return reinterpret_cast(this)[0]; + } + + int64_t InlinedAsInt64() const { return reinterpret_cast(this)[1]; } + + int32_t PrefixAsInt() const { return *reinterpret_cast(&prefix_); } + + // FIXME(bkietz) replace this with a std::array and forgo the union. + // Type punning (AKA violation of the strict aliasing rule) is undefined behavior. + // Using memcpy to access the bytes of the object representation of trivially copyable + // objects is not undefined behavior. Given sufficiently explicit hints on alignment + // and size, compilers elide memcpy calls in favor of identical assembly to what + // the type punning implementation produces. + // We rely on all members being laid out top to bottom. C++ + // guarantees this.
+ uint32_t size_ = 0; + std::array prefix_ = {0}; + union { + std::array inlined = {0}; + const char* data; + struct { + uint32_t buffer_index; + uint32_t offset; + } io_data; + } value_; +}; + +static_assert(sizeof(StringHeader) == 16, "struct size expected to be exactly 16 bytes"); +static_assert(alignof(StringHeader) == 8, + "struct alignment expected to be exactly 8 bytes"); + +} // namespace arrow diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h index 6a9b32d73a6..a6793210a89 100644 --- a/cpp/src/arrow/visit_data_inline.h +++ b/cpp/src/arrow/visit_data_inline.h @@ -144,6 +144,67 @@ struct ArraySpanInlineVisitor> { } }; +// BinaryView, StringView... +template +struct ArraySpanInlineVisitor> { + using c_type = std::string_view; + + static std::string_view GetView(const StringHeader& s, + const std::shared_ptr* char_buffers) { + if (!s.IsInline()) { + const auto& buffer = char_buffers[s.GetBufferIndex()]; + return std::string_view{buffer->data_as() + s.GetBufferOffset(), s.size()}; + } + return std::string_view{s.GetInlineData(), s.size()}; + } + + static const std::shared_ptr* GetCharBuffers(const ArraySpan& arr) { + return reinterpret_cast*>(arr.buffers[2].data); + } + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return Status::OK(); + } + auto* s = arr.GetValues(1); + if (checked_cast(arr.type)->has_raw_pointers()) { + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { return valid_func(std::string_view{s[index]}); }, + [&]() { return null_func(); }); + } else { + auto* char_buffers = GetCharBuffers(arr); + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { return valid_func(GetView(s[index], char_buffers)); }, + [&]() { return null_func(); }); + } + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& 
null_func) { + if (arr.length == 0) { + return; + } + auto* s = arr.GetValues(1); + if (checked_cast(arr.type)->has_raw_pointers()) { + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { valid_func(std::string_view{s[index]}); }, + std::forward(null_func)); + } else { + auto* char_buffers = GetCharBuffers(arr); + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { valid_func(GetView(s[index], char_buffers)); }, + std::forward(null_func)); + } + } +}; + // FixedSizeBinary, Decimal128 template struct ArraySpanInlineVisitor> { @@ -273,9 +334,8 @@ typename internal::call_traits::enable_if_return::type VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_values, int64_t null_count, ValidFunc&& valid_func, NullFunc&& null_func) { - ARROW_UNUSED(null_count); - internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset, - num_values); + internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? 
NULLPTR : valid_bits, + valid_bits_offset, num_values); int64_t position = 0; int64_t offset_position = valid_bits_offset; while (position < num_values) { diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index ed3d5bc2c68..e057f6b12fb 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -45,8 +45,10 @@ ARRAY_VISITOR_DEFAULT(UInt64Array) ARRAY_VISITOR_DEFAULT(HalfFloatArray) ARRAY_VISITOR_DEFAULT(FloatArray) ARRAY_VISITOR_DEFAULT(DoubleArray) -ARRAY_VISITOR_DEFAULT(BinaryArray) ARRAY_VISITOR_DEFAULT(StringArray) +ARRAY_VISITOR_DEFAULT(StringViewArray) +ARRAY_VISITOR_DEFAULT(BinaryArray) +ARRAY_VISITOR_DEFAULT(BinaryViewArray) ARRAY_VISITOR_DEFAULT(LargeBinaryArray) ARRAY_VISITOR_DEFAULT(LargeStringArray) ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray) @@ -96,7 +98,9 @@ TYPE_VISITOR_DEFAULT(HalfFloatType) TYPE_VISITOR_DEFAULT(FloatType) TYPE_VISITOR_DEFAULT(DoubleType) TYPE_VISITOR_DEFAULT(StringType) +TYPE_VISITOR_DEFAULT(StringViewType) TYPE_VISITOR_DEFAULT(BinaryType) +TYPE_VISITOR_DEFAULT(BinaryViewType) TYPE_VISITOR_DEFAULT(LargeStringType) TYPE_VISITOR_DEFAULT(LargeBinaryType) TYPE_VISITOR_DEFAULT(FixedSizeBinaryType) @@ -147,7 +151,9 @@ SCALAR_VISITOR_DEFAULT(HalfFloatScalar) SCALAR_VISITOR_DEFAULT(FloatScalar) SCALAR_VISITOR_DEFAULT(DoubleScalar) SCALAR_VISITOR_DEFAULT(StringScalar) +SCALAR_VISITOR_DEFAULT(StringViewScalar) SCALAR_VISITOR_DEFAULT(BinaryScalar) +SCALAR_VISITOR_DEFAULT(BinaryViewScalar) SCALAR_VISITOR_DEFAULT(LargeStringScalar) SCALAR_VISITOR_DEFAULT(LargeBinaryScalar) SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index b22d4d3c567..650b0e7ee0a 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -45,7 +45,9 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const FloatArray& array); virtual Status Visit(const DoubleArray& array); virtual Status Visit(const StringArray& array); + virtual Status Visit(const StringViewArray& 
array); virtual Status Visit(const BinaryArray& array); + virtual Status Visit(const BinaryViewArray& array); virtual Status Visit(const LargeStringArray& array); virtual Status Visit(const LargeBinaryArray& array); virtual Status Visit(const FixedSizeBinaryArray& array); @@ -94,7 +96,9 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const FloatType& type); virtual Status Visit(const DoubleType& type); virtual Status Visit(const StringType& type); + virtual Status Visit(const StringViewType& type); virtual Status Visit(const BinaryType& type); + virtual Status Visit(const BinaryViewType& type); virtual Status Visit(const LargeStringType& type); virtual Status Visit(const LargeBinaryType& type); virtual Status Visit(const FixedSizeBinaryType& type); @@ -143,7 +147,9 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const FloatScalar& scalar); virtual Status Visit(const DoubleScalar& scalar); virtual Status Visit(const StringScalar& scalar); + virtual Status Visit(const StringViewScalar& scalar); virtual Status Visit(const BinaryScalar& scalar); + virtual Status Visit(const BinaryViewScalar& scalar); virtual Status Visit(const LargeStringScalar& scalar); virtual Status Visit(const LargeBinaryScalar& scalar); virtual Status Visit(const FixedSizeBinaryScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 8f6b176ba8f..4b57abe53ff 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -40,7 +40,9 @@ namespace arrow { ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ + ACTION(StringView); \ ACTION(Binary); \ + ACTION(BinaryView); \ ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ diff --git a/cpp/src/gandiva/gdv_hash_function_stubs.cc b/cpp/src/gandiva/gdv_hash_function_stubs.cc index 018b0fbb709..710b3ffb757 100644 --- a/cpp/src/gandiva/gdv_hash_function_stubs.cc +++ b/cpp/src/gandiva/gdv_hash_function_stubs.cc @@ 
-15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//#pragma once +// #pragma once #include "gandiva/engine.h" #include "gandiva/exported_funcs.h" diff --git a/cpp/src/gandiva/gdv_string_function_stubs.cc b/cpp/src/gandiva/gdv_string_function_stubs.cc index cf04de3a8e1..069b972f9e6 100644 --- a/cpp/src/gandiva/gdv_string_function_stubs.cc +++ b/cpp/src/gandiva/gdv_string_function_stubs.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//#pragma once +// #pragma once #include "gandiva/gdv_function_stubs.h" diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index b84c51b3a6b..82c62c8851d 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -101,15 +101,15 @@ TEST(TestStringOps, TestChrBigInt) { out = chr_int64(ctx_ptr, -66, &out_len); EXPECT_EQ(std::string(out, out_len), "\xBE"); - //€ + // € out = chr_int32(ctx_ptr, 128, &out_len); EXPECT_EQ(std::string(out, out_len), "\x80"); - //œ + // œ out = chr_int64(ctx_ptr, 156, &out_len); EXPECT_EQ(std::string(out, out_len), "\x9C"); - //ÿ + // ÿ out = chr_int32(ctx_ptr, 255, &out_len); EXPECT_EQ(std::string(out, out_len), "\xFF"); diff --git a/cpp/src/generated/File_generated.h b/cpp/src/generated/File_generated.h index 5b219f1eb0e..06953c4a040 100644 --- a/cpp/src/generated/File_generated.h +++ b/cpp/src/generated/File_generated.h @@ -26,18 +26,15 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Block FLATBUFFERS_FINAL_CLASS { int64_t bodyLength_; public: - Block() - : offset_(0), - metaDataLength_(0), - padding0__(0), - bodyLength_(0) { - (void)padding0__; + Block() { + memset(static_cast<void *>(this), 0, sizeof(Block)); } Block(int64_t _offset, int32_t _metaDataLength, int64_t _bodyLength) : offset_(flatbuffers::EndianScalar(_offset)), metaDataLength_(flatbuffers::EndianScalar(_metaDataLength)), padding0__(0), 
bodyLength_(flatbuffers::EndianScalar(_bodyLength)) { + (void)padding0__; } /// Index to the start of the RecordBlock (note this is past the Message header) int64_t offset() const { @@ -122,6 +119,7 @@ struct FooterBuilder { : fbb_(_fbb) { start_ = fbb_.StartTable(); } + FooterBuilder &operator=(const FooterBuilder &); flatbuffers::Offset