From 6d490efbb203339a23aab3499ec3f68ab3a0799d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 9 Sep 2022 16:35:27 -0500 Subject: [PATCH 01/38] Draft basic scaffolding for Binary/StringView types and get compiling --- LICENSE.txt | 16 +- cpp/src/arrow/array/array_base.cc | 4 + cpp/src/arrow/array/array_binary.cc | 12 + cpp/src/arrow/array/array_binary.h | 58 ++++ cpp/src/arrow/array/builder_binary.cc | 86 ++++++ cpp/src/arrow/array/builder_binary.h | 248 ++++++++++++++++++ cpp/src/arrow/array/builder_dict.cc | 6 + cpp/src/arrow/array/builder_dict.h | 10 + cpp/src/arrow/array/concatenate.cc | 4 + cpp/src/arrow/array/util.cc | 13 + cpp/src/arrow/array/validate.cc | 20 +- cpp/src/arrow/compare.cc | 13 +- cpp/src/arrow/ipc/feather.cc | 4 +- cpp/src/arrow/ipc/metadata_internal.cc | 10 + cpp/src/arrow/ipc/reader.cc | 5 + cpp/src/arrow/ipc/writer.cc | 4 + cpp/src/arrow/json/test_common.h | 10 +- cpp/src/arrow/scalar.cc | 14 + cpp/src/arrow/scalar.h | 29 ++ cpp/src/arrow/testing/json_internal.cc | 11 +- cpp/src/arrow/type.cc | 16 +- cpp/src/arrow/type.h | 46 ++++ cpp/src/arrow/type_fwd.h | 21 ++ cpp/src/arrow/type_test.cc | 12 + cpp/src/arrow/type_traits.h | 57 +++- cpp/src/arrow/util/string_header.h | 219 ++++++++++++++++ cpp/src/arrow/visitor.cc | 8 +- cpp/src/arrow/visitor.h | 6 + cpp/src/arrow/visitor_generate.h | 2 + cpp/src/parquet/column_writer.cc | 1 + .../src/arrow/python/arrow_to_pandas.cc | 38 +-- .../src/arrow/python/python_to_arrow.cc | 23 +- 32 files changed, 975 insertions(+), 51 deletions(-) create mode 100644 cpp/src/arrow/util/string_header.h diff --git a/LICENSE.txt b/LICENSE.txt index c2b0a996fed..d355854e2c8 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1894,7 +1894,7 @@ This project includes code from the autobrew project. 
The following files are based on code from the autobrew project: * r/tools/autobrew * dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb -* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb +* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb Copyright (c) 2019, Jeroen Ooms License: MIT @@ -1976,6 +1976,20 @@ License: http://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- +This project includes code from Velox. + + * cpp/src/arrow/util/string_header.h + +is based on Velox's + + * velox/type/StringView.h + +Copyright: Copyright (c) Facebook, Inc. and its affiliates. +Home page: https://github.com/facebookincubator/velox +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + The file cpp/src/arrow/vendored/musl/strptime.c has the following license Copyright © 2005-2020 Rich Felker, et al. diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index f7b8d7954e1..5e84d928256 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -87,6 +87,10 @@ struct ScalarFromArraySlotImpl { return Finish(a.GetString(index_)); } + Status Visit(const BinaryViewArray& a) { + return Status::NotImplemented("ScalarFromArraySlot -> BinaryView"); + } + Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); } Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); } diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index 9466b5a48f9..cfc467160a6 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ b/cpp/src/arrow/array/array_binary.cc @@ -89,6 +89,18 @@ LargeStringArray::LargeStringArray(int64_t length, Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } +BinaryViewArray::BinaryViewArray(const std::shared_ptr& data) { + 
ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW); + SetData(data); +} + +StringViewArray::StringViewArray(const std::shared_ptr& data) { + ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW); + SetData(data); +} + +Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } + FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& data) { SetData(data); } diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 7e58a96ff84..03ee77fab8b 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -217,6 +218,63 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { Status ValidateUTF8() const; }; +// ---------------------------------------------------------------------- +// BinaryView and StringView + +/// Concrete Array class for variable-size binary view data using the +/// StringHeader struct to reference in-line or out-of-line string values +class ARROW_EXPORT BinaryViewArray : public PrimitiveArray { + public: + using TypeClass = BinaryViewType; + using IteratorType = stl::ArrayIterator; + + explicit BinaryViewArray(const std::shared_ptr& data); + + BinaryViewArray(int64_t length, const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : PrimitiveArray(binary_view(), length, data, null_bitmap, null_count, offset) {} + + const StringHeader* raw_values() const { + return reinterpret_cast(raw_values_) + data_->offset; + } + + StringHeader Value(int64_t i) const { return raw_values()[i]; } + + // For API compatibility with BinaryArray etc. 
+ std::string_view GetView(int64_t i) const { return std::string_view(Value(i)); } + + // EXPERIMENTAL + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + using PrimitiveArray::PrimitiveArray; +}; + +/// Concrete Array class for variable-size string view (utf-8) data using +/// StringHeader to reference in-line or out-of-line string values +class ARROW_EXPORT StringViewArray : public BinaryViewArray { + public: + using TypeClass = StringViewType; + + explicit StringViewArray(const std::shared_ptr& data); + + StringViewArray(int64_t length, const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : BinaryViewArray(utf8_view(), length, data, null_bitmap, null_count, offset) {} + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 571f450aab9..e0a7bc1193a 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -40,6 +40,92 @@ namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// Binary/StringView + +Status BinaryViewBuilder::AppendValues(const std::vector& values, + const uint8_t* valid_bytes) { + // We only need to allocate memory for the out-of-line strings + std::size_t out_of_line_total = std::accumulate( + values.begin(), values.end(), 0ULL, [](uint64_t sum, const std::string& str) { + size_t length = str.size(); + return sum + (length > StringHeader::kInlineSize ? 
length : 0); + }); + RETURN_NOT_OK(Reserve(values.size())); + RETURN_NOT_OK(ReserveData(out_of_line_total)); + + if (valid_bytes != nullptr) { + for (std::size_t i = 0; i < values.size(); ++i) { + if (valid_bytes[i]) { + UnsafeAppend(values[i]); + } else { + UnsafeAppendNull(); + } + } + } else { + for (std::size_t i = 0; i < values.size(); ++i) { + UnsafeAppend(values[i]); + } + } + UnsafeAppendToBitmap(valid_bytes, values.size()); + return Status::OK(); +} + +Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) { + auto bitmap = array.GetValues(0, 0); + auto values = array.GetValues(1) + offset; + + int64_t out_of_line_total = 0; + for (int64_t i = 0; i < length; i++) { + if (!values[i].IsInline()) { + out_of_line_total += static_cast(values[i].size()); + } + } + RETURN_NOT_OK(Reserve(length)); + RETURN_NOT_OK(ReserveData(out_of_line_total)); + for (int64_t i = 0; i < length; i++) { + if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) { + if (values[i].IsInline()) { + UnsafeAppend(values[i]); + } else { + UnsafeAppend(values[i].data(), values[i].size()); + } + } else { + UnsafeAppendNull(); + } + } + return Status::OK(); +} + +Status BinaryViewBuilder::FinishInternal(std::shared_ptr* out) { + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); + BufferVector buffers = {null_bitmap, data}; + for (auto&& buffer : data_heap_builder_.Finish()) { + buffers.push_back(std::move(buffer)); + } + *out = ArrayData::Make(type(), length_, std::move(buffers), null_count_); + capacity_ = length_ = null_count_ = 0; + Reset(); + return Status::OK(); +} + +Status BinaryViewBuilder::ReserveData(int64_t length) { + if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 4GB"); + } + return 
data_heap_builder_.Reserve(length); +} + +void BinaryViewBuilder::Reset() { + ArrayBuilder::Reset(); + data_builder_.Reset(); + data_heap_builder_.Reset(); +} + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index b0c4fe2fc81..6412516b971 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -463,6 +463,254 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { std::shared_ptr type() const override { return large_utf8(); } }; +// ---------------------------------------------------------------------- +// BinaryViewBuilder, StringViewBuilder +// +// The builders permit two styles of use: one where appended data is +// accumulated in a third buffer that is appended to the resulting ArrayData, +// and one where only the StringHeaders are appended. If you only want to +// append StringHeaders, then use the Append(const StringHeader&) methods + +namespace internal { + +// Because we construct StringHeader objects incrementally, resizing buffers is +// not an option as memory addresses for out-of-line strings will change. Thus, +// we allocate medium-sized memory chunks and accumulate data in those, which +// may result in some waste if there are many large-ish strings. If a string +// comes along that does not fit into a block, we allocate a new block and +// write into that. +// +// Later we can implement optimizations to continue filling underfull blocks +// after encountering a large string that required allocating a new block. 
+class ARROW_EXPORT StringHeapBuilder { + public: + static constexpr int64_t kDefaultBlocksize = 1 << 20; // 1MB + + StringHeapBuilder(MemoryPool* pool, int64_t blocksize = kDefaultBlocksize) + : pool_(pool), blocksize_(blocksize) {} + + const uint8_t* UnsafeAppend(const uint8_t* data, int64_t num_bytes) { + memcpy(current_out_buffer_, data, static_cast(num_bytes)); + const uint8_t* result = current_out_buffer_; + current_out_buffer_ += num_bytes; + current_remaining_bytes_ -= num_bytes; + return result; + } + + Result Append(const uint8_t* data, int64_t num_bytes) { + if (num_bytes > current_remaining_bytes_) { + ARROW_RETURN_NOT_OK(Reserve(num_bytes)); + } + return UnsafeAppend(data, num_bytes); + } + + /// \brief Ensure that the indicated number of bytes can be appended via + /// UnsafeAppend operations without the need to allocate more memory + Status Reserve(int64_t num_bytes) { + if (num_bytes > current_remaining_bytes_) { + current_remaining_bytes_ = + num_bytes > kDefaultBlocksize ? 
num_bytes : kDefaultBlocksize; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_block, + AllocateBuffer(current_remaining_bytes_, pool_)); + current_out_buffer_ = new_block->mutable_data(); + blocks_.emplace_back(std::move(new_block)); + } + return Status::OK(); + } + + void Reset() { + current_out_buffer_ = nullptr; + current_remaining_bytes_ = 0; + blocks_.clear(); + } + + int64_t current_remaining_bytes() const { return current_remaining_bytes_; } + + std::vector> Finish() { + current_out_buffer_ = nullptr; + current_remaining_bytes_ = 0; + return std::move(blocks_); + } + + private: + MemoryPool* pool_; + const int64_t blocksize_; + std::vector> blocks_; + + uint8_t* current_out_buffer_ = nullptr; + int64_t current_remaining_bytes_ = 0; +}; + +} // namespace internal + +class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { + public: + using TypeClass = BinaryViewType; + + BinaryViewBuilder(const std::shared_ptr& type, MemoryPool* pool) + : BinaryViewBuilder(pool) {} + + int64_t current_block_bytes_remaining() const { + return data_heap_builder_.current_remaining_bytes(); + } + + Status Append(const uint8_t* value, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + if (length > static_cast(StringHeader::kInlineSize)) { + // String is stored out-of-line + if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 4GB"); + } + // Overwrite 'value' since we will use that for the StringHeader value below + ARROW_ASSIGN_OR_RAISE(value, data_heap_builder_.Append(value, length)); + } + UnsafeAppend(StringHeader(value, length)); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status Append(const char* value, int64_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + Status Append(StringHeader value) { + 
ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(value); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append without checking capacity + /// + /// Builder should have been presized using Reserve() and ReserveData(), + /// respectively, and the value must not be larger than 4GB + void UnsafeAppend(const uint8_t* value, int64_t length) { + if (length > static_cast(StringHeader::kInlineSize)) { + // String is stored out-of-line + // Overwrite 'value' since we will use that for the StringHeader value below + value = data_heap_builder_.UnsafeAppend(value, length); + } + UnsafeAppend(StringHeader(value, length)); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppend(const char* value, int64_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + void UnsafeAppend(StringHeader value) { + data_builder_.UnsafeAppend(value); + UnsafeAppendToBitmap(true); + } + + /// \brief Ensures there is enough allocated available capacity in the + /// out-of-line data heap to append the indicated number of bytes without + /// additional allocations + Status ReserveData(int64_t length); + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, StringHeader()); // zero + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(StringHeader()); // zero + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append an empty element (length-0 inline string) + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(StringHeader("")); // zero + UnsafeAppendToBitmap(true); + return 
Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, StringHeader("")); + UnsafeSetNotNull(length); + return Status::OK(); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(StringHeader()); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + data_builder_.UnsafeAppend(StringHeader("")); + UnsafeAppendToBitmap(true); + } + + /// \brief Append a sequence of strings in one shot. + /// + /// \param[in] values a vector of strings + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const std::vector& values, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies + /// the underlying out-of-line string memory to avoid memory lifetime issues + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + void Reset() override; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override { return binary_view(); } + + protected: + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool()) + : ArrayBuilder(pool), data_builder_(pool), data_heap_builder_(pool) {} + + static constexpr int64_t ValueSizeLimit() { + return std::numeric_limits::max(); + } + + TypedBufferBuilder data_builder_; + + // Accumulates out-of-line data in fixed-size chunks which are then attached + // to the resulting ArrayData + internal::StringHeapBuilder data_heap_builder_; +}; + +class ARROW_EXPORT StringViewBuilder : public 
BinaryViewBuilder { + public: + using BinaryViewBuilder::BinaryViewBuilder; + std::shared_ptr type() const override { return utf8_view(); } +}; + // ---------------------------------------------------------------------- // FixedSizeBinaryBuilder diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 061fb600412..c99a6faceeb 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -193,6 +193,12 @@ Status DictionaryMemoTable::GetOrInsert(const BinaryType*, std::string_view valu return impl_->GetOrInsert(value, out); } +Status DictionaryMemoTable::GetOrInsert(const BinaryViewType*, std::string_view value, + int32_t* out) { + // Create BinaryArray dictionary for now + return impl_->GetOrInsert(value, out); +} + Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out) { return impl_->GetOrInsert(value, out); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index cb0aaf30991..0cc82930a14 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -60,6 +60,12 @@ struct DictionaryValue> { BinaryType, LargeBinaryType>::type; }; +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = BinaryViewType; +}; + template struct DictionaryValue> { using type = std::string_view; @@ -115,6 +121,10 @@ class ARROW_EXPORT DictionaryMemoTable { Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out); Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out); + // TODO: Consider working StringHeader throughout the hashing machinery to + // benefit from faster comparisons, reduced need to allocate memory + Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out); + class DictionaryMemoTableImpl; std::unique_ptr impl_; }; diff --git a/cpp/src/arrow/array/concatenate.cc 
b/cpp/src/arrow/array/concatenate.cc index 65c82384369..f2abaed4a46 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -229,6 +229,10 @@ class ConcatenateImpl { return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } + Status Visit(const BinaryViewType&) { + return Status::NotImplemented("binary / string view"); + } + Status Visit(const ListType&) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 07be8176fc0..0135fbb0049 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -267,6 +267,14 @@ class ArrayDataEndianSwapper { return Status::OK(); } + template + enable_if_t::value || + std::is_same::value, + Status> + Visit(const T& type) { + return Status::NotImplemented("Binary / string view"); + } + Status Visit(const ListType& type) { RETURN_NOT_OK(SwapOffsets(1)); return Status::OK(); @@ -643,6 +651,11 @@ class RepeatedArrayFactory { return Status::OK(); } + template + enable_if_binary_view_like Visit(const T&) { + return Status::NotImplemented("binary / string view"); + } + template enable_if_var_size_list Visit(const T& type) { using ScalarType = typename TypeTraits::ScalarType; diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 0f2bd458357..7c08d8da38e 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -48,6 +48,19 @@ struct UTF8DataValidator { return Status::NotImplemented(""); } + Status Visit(const StringViewType&) { + util::InitializeUTF8(); + + const auto* values = data.GetValues(1); + for (int64_t i = 0; i < data.length; ++i) { + if (ARROW_PREDICT_FALSE(!util::ValidateUTF8( + reinterpret_cast(values[i].data()), values[i].size()))) { + return Status::Invalid("Invalid UTF8 sequence at string index ", i); + } + } + return Status::OK(); + } + template enable_if_string Visit(const 
StringType&) { util::InitializeUTF8(); @@ -248,6 +261,10 @@ struct ValidateArrayImpl { Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); } + Status Visit(const BinaryViewType& type) { + return Status::NotImplemented("binary / string view"); + } + Status Visit(const ListType& type) { return ValidateListLike(type); } Status Visit(const LargeListType& type) { return ValidateListLike(type); } @@ -796,7 +813,8 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d ARROW_EXPORT Status ValidateUTF8(const ArrayData& data) { - DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING); + DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW || + data.type->id() == Type::LARGE_STRING); UTF8DataValidator validator{data}; return VisitTypeInline(*data.type, &validator); } diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index df41cd22c9e..99d87a2d1b6 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -261,6 +261,11 @@ class RangeDataEqualsImpl { // Also matches StringType Status Visit(const BinaryType& type) { return CompareBinary(type); } + // Also matches StringViewType + Status Visit(const BinaryViewType& type) { + return Status::NotImplemented("Binary / string view"); + } + // Also matches LargeStringType Status Visit(const LargeBinaryType& type) { return CompareBinary(type); } @@ -625,7 +630,7 @@ class TypeEqualsVisitor { template enable_if_t::value || is_primitive_ctype::value || - is_base_binary_type::value, + is_base_binary_type::value || is_binary_view_like_type::value, Status> Visit(const T&) { result_ = true; @@ -809,6 +814,12 @@ class ScalarEqualsVisitor { return Status::OK(); } + Status Visit(const BinaryViewScalar& left) { + const auto& right = checked_cast(right_); + result_ = left.value == right.value; + return Status::OK(); + } + Status Visit(const Decimal128Scalar& left) { const auto& right = checked_cast(right_); result_ 
= left.value == right.value; diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index b6d3a3d7d8c..1ef076fac40 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -536,8 +536,8 @@ struct ArrayWriterV1 { is_nested_type::value || is_null_type::value || is_decimal_type::value || std::is_same::value || is_duration_type::value || is_interval_type::value || is_fixed_size_binary_type::value || - std::is_same::value || std::is_same::value || - std::is_same::value, + is_binary_view_like_type::value || std::is_same::value || + std::is_same::value || std::is_same::value, Status>::type Visit(const T& type) { return Status::NotImplemented(type.ToString()); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 1394516ecd5..255bff2241d 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -534,6 +534,16 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const BinaryViewType& type) { + // BinaryView will be written to IPC as a normal binary array + return Visit(BinaryType()); + } + + Status Visit(const StringViewType& type) { + // StringView will be written to IPC as a normal UTF8 string array + return Visit(StringType()); + } + Status Visit(const LargeBinaryType& type) { fb_type_ = flatbuf::Type::LargeBinary; type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union(); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 694cc732253..4577a416523 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -348,6 +348,11 @@ class ArrayLoader { return LoadBinary(type.id()); } + Status Visit(const BinaryViewType& type) { + DCHECK(false); + return Status::NotImplemented("Reading IPC format to binary view is not supported"); + } + Status Visit(const FixedSizeBinaryType& type) { out_->buffers.resize(2); RETURN_NOT_OK(LoadCommon(type.id())); diff --git a/cpp/src/arrow/ipc/writer.cc 
b/cpp/src/arrow/ipc/writer.cc index f0f0e96ee46..1b7fb74cb9d 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -425,6 +425,10 @@ class RecordBatchSerializer { return Status::OK(); } + Status Visit(const BinaryViewArray& array) { + return Status::NotImplemented("Binary / string view type"); + } + Status Visit(const FixedSizeListArray& array) { --max_recursion_depth_; auto size = array.list_type()->list_size(); diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 0f7b3466fdb..f7ab6fd1027 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -110,8 +110,7 @@ struct GenerateImpl { return OK(writer.Double(val)); } - template - enable_if_base_binary Visit(const T&) { + Status GenerateAscii(const DataType&) { auto size = std::poisson_distribution<>{4}(e); std::uniform_int_distribution gen_char(32, 126); // FIXME generate UTF8 std::string s(size, '\0'); @@ -119,6 +118,13 @@ struct GenerateImpl { return OK(writer.String(s.c_str())); } + template + enable_if_base_binary Visit(const T& t) { + return GenerateAscii(t); + } + + Status Visit(const BinaryViewType& t) { return GenerateAscii(t); } + template enable_if_list_like Visit(const T& t) { auto size = std::poisson_distribution<>{4}(e); diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 0537ddafe29..622c767a443 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -71,6 +71,12 @@ struct ScalarHashImpl { Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); } + Status Visit(const BinaryViewScalar& s) { + const StringHeader& v = s.value; + hash_ ^= internal::ComputeStringHash<1>(v.data(), v.size()); + return Status::OK(); + } + template Status Visit(const TemporalScalar& s) { return ValueHash(s); @@ -263,6 +269,14 @@ struct ScalarValidateImpl { Status Visit(const StringScalar& s) { return ValidateStringScalar(s); } + Status Visit(const BinaryViewScalar& s) { + return 
Status::NotImplemented("Binary view"); + } + + Status Visit(const StringViewScalar& s) { + return Status::NotImplemented("String view"); + } + Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); } template diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index d23b33e28f7..c1c08571a6a 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -37,6 +37,7 @@ #include "arrow/type_traits.h" #include "arrow/util/compare.h" #include "arrow/util/decimal.h" +#include "arrow/util/string_header.h" #include "arrow/util/visibility.h" #include "arrow/visit_type_inline.h" @@ -286,6 +287,34 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { StringScalar() : StringScalar(utf8()) {} }; +struct ARROW_EXPORT BinaryViewScalar : public internal::PrimitiveScalarBase { + using internal::PrimitiveScalarBase::PrimitiveScalarBase; + using TypeClass = BinaryViewType; + + explicit BinaryViewScalar(StringHeader value, std::shared_ptr type) + : internal::PrimitiveScalarBase(std::move(type), true), value(value) {} + + explicit BinaryViewScalar(StringHeader value) + : BinaryViewScalar(value, binary_view()) {} + + BinaryViewScalar() : internal::PrimitiveScalarBase(binary_view(), false) {} + + void* mutable_data() override { return reinterpret_cast(&this->value); } + + std::string_view view() const override { return std::string_view(this->value); } + + StringHeader value; +}; + +struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { + using TypeClass = StringViewType; + + explicit StringViewScalar(StringHeader value) + : BinaryViewScalar(std::move(value), utf8_view()) {} + + StringViewScalar() : BinaryViewScalar(utf8_view()) {} +}; + struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; diff --git a/cpp/src/arrow/testing/json_internal.cc b/cpp/src/arrow/testing/json_internal.cc index babff621b1f..45db2346d28 100644 --- 
a/cpp/src/arrow/testing/json_internal.cc +++ b/cpp/src/arrow/testing/json_internal.cc @@ -227,8 +227,9 @@ class SchemaWriter { template enable_if_t::value || is_primitive_ctype::value || - is_base_binary_type::value || is_base_list_type::value || - is_struct_type::value || is_run_end_encoded_type::value> + is_base_binary_type::value || is_binary_view_like_type::value || + is_base_list_type::value || is_struct_type::value || + is_run_end_encoded_type::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -386,6 +387,8 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } + Status Visit(const StringViewType& type) { return WritePrimitive("utf8_view", type); } + Status Visit(const BinaryViewType& type) { return WritePrimitive("binary_view", type); } Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); } Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); } Status Visit(const FixedSizeBinaryType& type) { @@ -1367,6 +1370,10 @@ class ArrayReader { return FinishBuilder(&builder); } + Status Visit(const BinaryViewType& type) { + return Status::NotImplemented("Binary / string view"); + } + Status Visit(const DayTimeIntervalType& type) { DayTimeIntervalBuilder builder(pool_); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 68dc2aabe96..5676cd90e07 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -60,10 +60,14 @@ constexpr Type::type FixedSizeListType::type_id; constexpr Type::type BinaryType::type_id; +constexpr Type::type BinaryViewType::type_id; + constexpr Type::type LargeBinaryType::type_id; constexpr Type::type StringType::type_id; +constexpr Type::type StringViewType::type_id; + constexpr Type::type LargeStringType::type_id; 
constexpr Type::type FixedSizeBinaryType::type_id; @@ -190,7 +194,9 @@ std::string ToString(Type::type id) { TO_STRING_CASE(INTERVAL_MONTHS) TO_STRING_CASE(DURATION) TO_STRING_CASE(STRING) + TO_STRING_CASE(STRING_VIEW) TO_STRING_CASE(BINARY) + TO_STRING_CASE(BINARY_VIEW) TO_STRING_CASE(LARGE_STRING) TO_STRING_CASE(LARGE_BINARY) TO_STRING_CASE(FIXED_SIZE_BINARY) @@ -593,10 +599,14 @@ std::string FixedSizeListType::ToString() const { std::string BinaryType::ToString() const { return "binary"; } +std::string BinaryViewType::ToString() const { return "binary_view"; } + std::string LargeBinaryType::ToString() const { return "large_binary"; } std::string StringType::ToString() const { return "string"; } +std::string StringViewType::ToString() const { return "string_view"; } + std::string LargeStringType::ToString() const { return "large_string"; } int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } @@ -2320,8 +2330,10 @@ PARAMETER_LESS_FINGERPRINT(HalfFloat) PARAMETER_LESS_FINGERPRINT(Float) PARAMETER_LESS_FINGERPRINT(Double) PARAMETER_LESS_FINGERPRINT(Binary) +PARAMETER_LESS_FINGERPRINT(BinaryView) PARAMETER_LESS_FINGERPRINT(LargeBinary) PARAMETER_LESS_FINGERPRINT(String) +PARAMETER_LESS_FINGERPRINT(StringView) PARAMETER_LESS_FINGERPRINT(LargeString) PARAMETER_LESS_FINGERPRINT(Date32) PARAMETER_LESS_FINGERPRINT(Date64) @@ -2527,8 +2539,10 @@ TYPE_FACTORY(float16, HalfFloatType) TYPE_FACTORY(float32, FloatType) TYPE_FACTORY(float64, DoubleType) TYPE_FACTORY(utf8, StringType) +TYPE_FACTORY(utf8_view, StringViewType) TYPE_FACTORY(large_utf8, LargeStringType) TYPE_FACTORY(binary, BinaryType) +TYPE_FACTORY(binary_view, BinaryViewType) TYPE_FACTORY(large_binary, LargeBinaryType) TYPE_FACTORY(date64, Date64Type) TYPE_FACTORY(date32, Date32Type) @@ -2782,7 +2796,7 @@ void InitStaticData() { // * Time32 // * Time64 // * Timestamp - g_primitive_types = {null(), boolean(), date32(), date64()}; + g_primitive_types = {null(), boolean(), date32(), date64(), 
binary_view(), utf8_view()}; Extend(g_numeric_types, &g_primitive_types); Extend(g_base_binary_types, &g_primitive_types); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 48228d43ef9..fb9b80d9f34 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -33,6 +33,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/endian.h" #include "arrow/util/macros.h" +#include "arrow/util/string_header.h" #include "arrow/util/visibility.h" #include "arrow/visitor.h" // IWYU pragma: keep @@ -710,6 +711,33 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType { explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} }; +/// \brief Concrete type class for variable-size binary view data using +/// StringHeader structs +class ARROW_EXPORT BinaryViewType : public DataType { + public: + static constexpr Type::type type_id = Type::BINARY_VIEW; + static constexpr bool is_utf8 = false; + using PhysicalType = BinaryViewType; + + static constexpr const char* type_name() { return "binary_view"; } + + BinaryViewType() : BinaryViewType(Type::BINARY_VIEW) {} + + DataTypeLayout layout() const override { + return DataTypeLayout( + {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))}); + } + + std::string ToString() const override; + std::string name() const override { return "binary_view"; } + + protected: + std::string ComputeFingerprint() const override; + + // Allow subclasses like StringType to change the logical type. 
+ explicit BinaryViewType(Type::type logical_type) : DataType(logical_type) {} +}; + /// \brief Concrete type class for large variable-size binary data class ARROW_EXPORT LargeBinaryType : public BaseBinaryType { public: @@ -756,6 +784,24 @@ class ARROW_EXPORT StringType : public BinaryType { std::string ComputeFingerprint() const override; }; +/// \brief Concrete type class for variable-size string data, utf8-encoded +class ARROW_EXPORT StringViewType : public BinaryViewType { + public: + static constexpr Type::type type_id = Type::STRING_VIEW; + static constexpr bool is_utf8 = true; + using PhysicalType = BinaryViewType; + + static constexpr const char* type_name() { return "utf8_view"; } + + StringViewType() : BinaryViewType(Type::STRING_VIEW) {} + + std::string ToString() const override; + std::string name() const override { return "utf8_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for large variable-size string data, utf8-encoded class ARROW_EXPORT LargeStringType : public LargeBinaryType { public: diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 657abbaecc4..64f837f84aa 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -108,6 +108,11 @@ class BinaryArray; class BinaryBuilder; struct BinaryScalar; +class BinaryViewType; +class BinaryViewArray; +class BinaryViewBuilder; +struct BinaryViewScalar; + class LargeBinaryType; class LargeBinaryArray; class LargeBinaryBuilder; @@ -123,6 +128,11 @@ class StringArray; class StringBuilder; struct StringScalar; +class StringViewType; +class StringViewArray; +class StringViewBuilder; +struct StringViewScalar; + class LargeStringType; class LargeStringArray; class LargeStringBuilder; @@ -413,6 +423,13 @@ struct Type { /// Run-end encoded data. 
RUN_END_ENCODED, + /// String (UTF8) view type with 4-byte prefix and inline small string + /// optimization + STRING_VIEW, + + /// Bytes view type with 4-byte prefix and inline small string optimization + BINARY_VIEW, + // Leave this at the end MAX_ID }; @@ -454,10 +471,14 @@ ARROW_EXPORT const std::shared_ptr& float32(); ARROW_EXPORT const std::shared_ptr& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr& utf8(); +/// \brief Return a StringViewType instance +ARROW_EXPORT const std::shared_ptr& utf8_view(); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr& large_utf8(); /// \brief Return a BinaryType instance ARROW_EXPORT const std::shared_ptr& binary(); +/// \brief Return a BinaryViewType instance +ARROW_EXPORT const std::shared_ptr& binary_view(); /// \brief Return a LargeBinaryType instance ARROW_EXPORT const std::shared_ptr& large_binary(); /// \brief Return a Date32Type instance diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 3c83da9f2e6..93e7e6a5f9d 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1045,9 +1045,21 @@ TEST(TestBinaryType, ToString) { TEST(TestStringType, ToString) { StringType str; ASSERT_EQ(str.id(), Type::STRING); + ASSERT_EQ(str.name(), std::string("utf8")); + ASSERT_EQ(str.type_name(), std::string("utf8")); ASSERT_EQ(str.ToString(), std::string("string")); } +TEST(TestBinaryViewType, ToString) { + BinaryViewType t1; + BinaryViewType e1; + StringViewType t2; + AssertTypeEqual(t1, e1); + AssertTypeNotEqual(t1, t2); + ASSERT_EQ(t1.id(), Type::BINARY_VIEW); + ASSERT_EQ(t1.ToString(), std::string("binary_view")); +} + TEST(TestLargeBinaryTypes, ToString) { BinaryType bt1; LargeBinaryType t1; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 7204fd6d85d..9d47493d889 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -341,6 +341,16 @@ struct TypeTraits { static inline 
std::shared_ptr type_singleton() { return binary(); } }; +template <> +struct TypeTraits { + using ArrayType = BinaryViewArray; + using BuilderType = BinaryViewBuilder; + using ScalarType = BinaryViewScalar; + using CType = StringHeader; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return binary_view(); } +}; + template <> struct TypeTraits { using ArrayType = LargeBinaryArray; @@ -371,6 +381,16 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return utf8(); } }; +template <> +struct TypeTraits { + using ArrayType = StringViewArray; + using BuilderType = StringViewBuilder; + using ScalarType = StringViewScalar; + using CType = StringHeader; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return utf8_view(); } +}; + template <> struct TypeTraits { using ArrayType = LargeStringArray; @@ -399,6 +419,11 @@ struct CTypeTraits : public TypeTraits { using ArrowType = StringType; }; +template <> +struct CTypeTraits : public TypeTraits { + using ArrowType = BinaryViewType; +}; + template <> struct CTypeTraits : public CTypeTraits {}; @@ -614,9 +639,28 @@ using is_string_type = template using enable_if_string = enable_if_t::value, R>; +template +using is_binary_view_like_type = std::is_base_of; + +template +using is_binary_view_type = std::is_same; + +template +using is_string_view_type = std::is_same; + +template +using enable_if_binary_view_like = enable_if_t::value, R>; + +template +using enable_if_binary_view = enable_if_t::value, R>; + +template +using enable_if_string_view = enable_if_t::value, R>; + template using is_string_like_type = - std::integral_constant::value && T::is_utf8>; + std::integral_constant::value && T::is_utf8) || + is_string_view_type::value>; template using enable_if_string_like = enable_if_t::value, R>; @@ -639,10 +683,9 @@ template using enable_if_fixed_width_type = enable_if_t::value, R>; template -using 
is_binary_like_type = - std::integral_constant::value && - !is_string_like_type::value) || - is_fixed_size_binary_type::value>; +using is_binary_like_type = std::integral_constant< + bool, (is_base_binary_type::value && !is_string_like_type::value) || + is_binary_view_type::value || is_fixed_size_binary_type::value>; template using enable_if_binary_like = enable_if_t::value, R>; @@ -801,8 +844,10 @@ using enable_if_has_c_type = enable_if_t::value, R>; template using has_string_view = std::integral_constant::value || - std::is_same::value || + std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || std::is_same::value>; diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h new file mode 100644 index 00000000000..29f378a580a --- /dev/null +++ b/cpp/src/arrow/util/string_header.h @@ -0,0 +1,219 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace arrow { + +// Variable length string or binary with 4 byte prefix and inline optimization +// for small values (12 bytes or fewer). This is similar to std::string_view +// except that the referenced is limited in size to UINT32_MAX and up to the +// first four bytes of the string are copied into the struct. The prefix allows +// failing comparisons early and can reduce the CPU cache working set when +// dealing with short strings. +// +// Short string |----|----|--------| +// ^ ^ ^ +// | | | +// size prefix remaining in-line portion +// +// Long string |----|----|--------| +// ^ ^ ^ +// | | | +// size prefix pointer to out-of-line portion +// +// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. +// +// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf +struct StringHeader { + public: + using value_type = char; + + static constexpr size_t kPrefixSize = 4; + static constexpr size_t kInlineSize = 12; + + StringHeader() { + static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes"); + ; + memset(this, 0, sizeof(StringHeader)); + } + + explicit StringHeader(uint32_t size) : size_(size) { + memset(prefix_, 0, kPrefixSize); + value_.data = nullptr; + } + + StringHeader(const char* data, size_t len) : size_(len) { + // TODO: better option than assert? + assert(data || size_ == 0); + if (IsInline()) { + // Zero the inline part. 
+ // this makes sure that inline strings can be compared for equality with 2 + // int64 compares. + memset(prefix_, 0, kPrefixSize); + if (size_ == 0) { + return; + } + // small string: inlined. Zero the last 8 bytes first to allow for whole + // word comparison. + value_.data = nullptr; + memcpy(prefix_, data, size_); + } else { + // large string: store pointer + memcpy(prefix_, data, kPrefixSize); + value_.data = data; + } + } + + StringHeader(const uint8_t* data, int64_t len) + : StringHeader(reinterpret_cast(data), static_cast(len)) {} + + // Making StringHeader implicitly constructible/convertible from char* and + // string literals, in order to allow for a more flexible API and optional + // interoperability. E.g: + // + // StringHeader bh = "literal"; + // std::optional obh = "literal"; + // + /* implicit */ StringHeader(const char* data) : StringHeader(data, strlen(data)) {} + + explicit StringHeader(const std::string& value) + : StringHeader(value.data(), value.size()) {} + + explicit StringHeader(const std::string_view& value) + : StringHeader(value.data(), value.size()) {} + + bool IsInline() const { return IsInline(size_); } + + static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; } + + const char* data() const { return IsInline() ? prefix_ : value_.data; } + + size_t size() const { return size_; } + + size_t capacity() const { return size_; } + + friend std::ostream& operator<<(std::ostream& os, const StringHeader& header) { + os.write(header.data(), header.size()); + return os; + } + + bool operator==(const StringHeader& other) const { + // Compare lengths and first 4 characters. + if (SizeAndPrefixAsInt64() != other.SizeAndPrefixAsInt64()) { + return false; + } + if (IsInline()) { + // The inline part is zeroed at construction, so we can compare + // a word at a time if data extends past 'prefix_'. 
+ return size_ <= kPrefixSize || InlinedAsInt64() == other.InlinedAsInt64(); + } + // Sizes are equal and this is not inline, therefore both are out + // of line and have kPrefixSize first in common. + return memcmp(value_.data + kPrefixSize, other.value_.data + kPrefixSize, + size_ - kPrefixSize) == 0; + } + + bool operator!=(const StringHeader& other) const { return !(*this == other); } + + // Returns 0, if this == other + // < 0, if this < other + // > 0, if this > other + int32_t Compare(const StringHeader& other) const { + if (PrefixAsInt() != other.PrefixAsInt()) { + // The result is decided on prefix. The shorter will be less + // because the prefix is padded with zeros. + return memcmp(prefix_, other.prefix_, kPrefixSize); + } + int32_t size = std::min(size_, other.size_) - kPrefixSize; + if (size <= 0) { + // One ends within the prefix. + return size_ - other.size_; + } + if (static_cast(size) <= kInlineSize && IsInline() && other.IsInline()) { + int32_t result = memcmp(value_.inlined, other.value_.inlined, size); + return (result != 0) ? result : size_ - other.size_; + } + int32_t result = memcmp(data() + kPrefixSize, other.data() + kPrefixSize, size); + return (result != 0) ? 
result : size_ - other.size_; + } + + bool operator<(const StringHeader& other) const { return Compare(other) < 0; } + + bool operator<=(const StringHeader& other) const { return Compare(other) <= 0; } + + bool operator>(const StringHeader& other) const { return Compare(other) > 0; } + + bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; } + + operator std::string() const { return std::string(data(), size()); } + + std::string GetString() const { return *this; } + + explicit operator std::string_view() const { return std::string_view(data(), size()); } + + const char* begin() const { return data(); } + + const char* end() const { return data() + size(); } + + bool empty() const { return size() == 0; } + + private: + inline int64_t SizeAndPrefixAsInt64() const { + return reinterpret_cast(this)[0]; + } + + inline int64_t InlinedAsInt64() const { + return reinterpret_cast(this)[1]; + } + + int32_t PrefixAsInt() const { return *reinterpret_cast(&prefix_); } + + // We rely on all members being laid out top to bottom . C++ + // guarantees this. 
+ uint32_t size_; + char prefix_[4]; + union { + char inlined[8]; + const char* data; + } value_; +}; + +} // namespace arrow diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index ed3d5bc2c68..e057f6b12fb 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -45,8 +45,10 @@ ARRAY_VISITOR_DEFAULT(UInt64Array) ARRAY_VISITOR_DEFAULT(HalfFloatArray) ARRAY_VISITOR_DEFAULT(FloatArray) ARRAY_VISITOR_DEFAULT(DoubleArray) -ARRAY_VISITOR_DEFAULT(BinaryArray) ARRAY_VISITOR_DEFAULT(StringArray) +ARRAY_VISITOR_DEFAULT(StringViewArray) +ARRAY_VISITOR_DEFAULT(BinaryArray) +ARRAY_VISITOR_DEFAULT(BinaryViewArray) ARRAY_VISITOR_DEFAULT(LargeBinaryArray) ARRAY_VISITOR_DEFAULT(LargeStringArray) ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray) @@ -96,7 +98,9 @@ TYPE_VISITOR_DEFAULT(HalfFloatType) TYPE_VISITOR_DEFAULT(FloatType) TYPE_VISITOR_DEFAULT(DoubleType) TYPE_VISITOR_DEFAULT(StringType) +TYPE_VISITOR_DEFAULT(StringViewType) TYPE_VISITOR_DEFAULT(BinaryType) +TYPE_VISITOR_DEFAULT(BinaryViewType) TYPE_VISITOR_DEFAULT(LargeStringType) TYPE_VISITOR_DEFAULT(LargeBinaryType) TYPE_VISITOR_DEFAULT(FixedSizeBinaryType) @@ -147,7 +151,9 @@ SCALAR_VISITOR_DEFAULT(HalfFloatScalar) SCALAR_VISITOR_DEFAULT(FloatScalar) SCALAR_VISITOR_DEFAULT(DoubleScalar) SCALAR_VISITOR_DEFAULT(StringScalar) +SCALAR_VISITOR_DEFAULT(StringViewScalar) SCALAR_VISITOR_DEFAULT(BinaryScalar) +SCALAR_VISITOR_DEFAULT(BinaryViewScalar) SCALAR_VISITOR_DEFAULT(LargeStringScalar) SCALAR_VISITOR_DEFAULT(LargeBinaryScalar) SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index b22d4d3c567..650b0e7ee0a 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -45,7 +45,9 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const FloatArray& array); virtual Status Visit(const DoubleArray& array); virtual Status Visit(const StringArray& array); + virtual Status Visit(const StringViewArray& array); virtual Status Visit(const 
BinaryArray& array); + virtual Status Visit(const BinaryViewArray& array); virtual Status Visit(const LargeStringArray& array); virtual Status Visit(const LargeBinaryArray& array); virtual Status Visit(const FixedSizeBinaryArray& array); @@ -94,7 +96,9 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const FloatType& type); virtual Status Visit(const DoubleType& type); virtual Status Visit(const StringType& type); + virtual Status Visit(const StringViewType& type); virtual Status Visit(const BinaryType& type); + virtual Status Visit(const BinaryViewType& type); virtual Status Visit(const LargeStringType& type); virtual Status Visit(const LargeBinaryType& type); virtual Status Visit(const FixedSizeBinaryType& type); @@ -143,7 +147,9 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const FloatScalar& scalar); virtual Status Visit(const DoubleScalar& scalar); virtual Status Visit(const StringScalar& scalar); + virtual Status Visit(const StringViewScalar& scalar); virtual Status Visit(const BinaryScalar& scalar); + virtual Status Visit(const BinaryViewScalar& scalar); virtual Status Visit(const LargeStringScalar& scalar); virtual Status Visit(const LargeBinaryScalar& scalar); virtual Status Visit(const FixedSizeBinaryScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 8f6b176ba8f..4b57abe53ff 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -40,7 +40,9 @@ namespace arrow { ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ + ACTION(StringView); \ ACTION(Binary); \ + ACTION(BinaryView); \ ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 33e9f8f6658..dc803df312a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -134,6 +134,7 @@ struct ValueBufferSlicer { 
NOT_IMPLEMENTED_VISIT(Dictionary); NOT_IMPLEMENTED_VISIT(RunEndEncoded); NOT_IMPLEMENTED_VISIT(Extension); + NOT_IMPLEMENTED_VISIT(BinaryView); #undef NOT_IMPLEMENTED_VISIT diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index f6b7ca9d54a..a142a6eaca0 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -116,39 +116,21 @@ void BufferCapsule_Destructor(PyObject* capsule) { using internal::arrow_traits; using internal::npy_traits; -template +template struct WrapBytes {}; -template <> -struct WrapBytes { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyUnicode_FromStringAndSize(data, length); - } -}; - -template <> -struct WrapBytes { +template +struct WrapBytes::value || + is_string_view_type::value>> { static inline PyObject* Wrap(const char* data, int64_t length) { return PyUnicode_FromStringAndSize(data, length); } }; -template <> -struct WrapBytes { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyBytes_FromStringAndSize(data, length); - } -}; - -template <> -struct WrapBytes { - static inline PyObject* Wrap(const char* data, int64_t length) { - return PyBytes_FromStringAndSize(data, length); - } -}; - -template <> -struct WrapBytes { +template +struct WrapBytes::value || + is_binary_view_type::value || + is_fixed_size_binary_type::value>> { static inline PyObject* Wrap(const char* data, int64_t length) { return PyBytes_FromStringAndSize(data, length); } @@ -1150,7 +1132,9 @@ struct ObjectWriterVisitor { } template - enable_if_t::value || is_fixed_size_binary_type::value, + enable_if_t::value || + is_binary_view_like_type::value || + is_fixed_size_binary_type::value, Status> Visit(const Type& type) { auto WrapValue = [](const std::string_view& view, PyObject** out) { diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc 
b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 4f7420d8295..b7dac78b401 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -479,13 +479,17 @@ class PyValue { // The binary-like intermediate representation is PyBytesView because it keeps temporary // python objects alive (non-contiguous memoryview) and stores whether the original - // object was unicode encoded or not, which is used for unicode -> bytes coersion if + // object was unicode encoded or not, which is used for unicode -> bytes coercion if // there is a non-unicode object observed. static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) { return view.ParseString(obj); } + static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) { + return view.ParseString(obj); + } + static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, PyBytesView& view) { ARROW_RETURN_NOT_OK(view.ParseString(obj)); @@ -672,12 +676,9 @@ class PyPrimitiveConverter:: PyBytesView view_; }; -template -class PyPrimitiveConverter> - : public PrimitiveConverter { +template +class PyBinaryConverter : public PrimitiveConverter { public: - using OffsetType = typename T::offset_type; - Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { this->primitive_builder_->UnsafeAppendNull(); @@ -701,7 +702,7 @@ class PyPrimitiveConverter> Result> ToArray() override { ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::ToArray())); if (observed_binary_) { - // if we saw any non-unicode, cast results to BinaryArray + // if we saw any non-unicode, cast results to BinaryArray/BinaryViewArray auto binary_type = TypeTraits::type_singleton(); return array->View(binary_type); } else { @@ -714,6 +715,14 @@ class PyPrimitiveConverter> bool observed_binary_ = false; }; +template +class PyPrimitiveConverter> + : public PyBinaryConverter {}; + +template +class PyPrimitiveConverter> + 
: public PyBinaryConverter {}; + template class PyDictionaryConverter> : public DictionaryConverter { From 21688eb386cee41dfcffea977cd2c610c80bd67d Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 18 Oct 2022 17:14:16 +0200 Subject: [PATCH 02/38] BinaryViewBuilder: fix duplicate values in null bitmap --- cpp/src/arrow/array/builder_binary.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 6412516b971..3b4c0921f4d 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -567,7 +567,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { ARROW_ASSIGN_OR_RAISE(value, data_heap_builder_.Append(value, length)); } UnsafeAppend(StringHeader(value, length)); - UnsafeAppendToBitmap(true); return Status::OK(); } From 6b6cd959ad11ff0bd54d5cd3c659cb03f5f76b38 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 18 Oct 2022 17:16:57 +0200 Subject: [PATCH 03/38] enable JSON converter for StringView/BinaryView --- cpp/src/arrow/ipc/json_simple.cc | 4 ++++ cpp/src/arrow/json/converter.cc | 2 ++ 2 files changed, 6 insertions(+) diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index eea0c973028..4d2d803f3f6 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -847,6 +847,8 @@ Status GetDictConverter(const std::shared_ptr& type, PARAM_CONVERTER_CASE(Type::BINARY, StringConverter, BinaryType) PARAM_CONVERTER_CASE(Type::LARGE_STRING, StringConverter, LargeStringType) PARAM_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter, LargeBinaryType) + PARAM_CONVERTER_CASE(Type::STRING_VIEW, StringConverter, StringViewType) + PARAM_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter, BinaryViewType) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter, FixedSizeBinaryType) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter, Decimal128Type) @@ -905,6 +907,8 @@ 
Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::STRING_VIEW, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter<>) diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc index 04ebe4714ce..c393b77acf3 100644 --- a/cpp/src/arrow/json/converter.cc +++ b/cpp/src/arrow/json/converter.cc @@ -304,6 +304,8 @@ Status MakeConverter(const std::shared_ptr& out_type, MemoryPool* pool CONVERTER_CASE(Type::STRING, BinaryConverter); CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter); CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter); + CONVERTER_CASE(Type::BINARY_VIEW, BinaryConverter); + CONVERTER_CASE(Type::STRING_VIEW, BinaryConverter); CONVERTER_CASE(Type::DECIMAL128, DecimalConverter); CONVERTER_CASE(Type::DECIMAL256, DecimalConverter); default: From 52eb44655e4be017489ce2b288e66d8f208b38eb Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 18 Oct 2022 17:18:02 +0200 Subject: [PATCH 04/38] add StringView/BinaryView to AllTypeIds --- cpp/src/arrow/type.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 5676cd90e07..1c0a7544cb8 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -130,6 +130,8 @@ std::vector AllTypeIds() { Type::BINARY, Type::LARGE_STRING, Type::LARGE_BINARY, + Type::STRING_VIEW, + Type::BINARY_VIEW, Type::FIXED_SIZE_BINARY, Type::STRUCT, Type::LIST, From 8a75259fa8c4fd0c1e7763135b5751dc7ae10d04 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 18 Oct 2022 17:21:14 +0200 Subject: [PATCH 05/38] implement inline visitor for 
StringView/BinaryView --- cpp/src/arrow/visit_data_inline.h | 53 +++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h index 6a9b32d73a6..116a031757a 100644 --- a/cpp/src/arrow/visit_data_inline.h +++ b/cpp/src/arrow/visit_data_inline.h @@ -144,6 +144,59 @@ struct ArraySpanInlineVisitor> { } }; +// BinaryView, StringView... +template +struct ArraySpanInlineVisitor> { + using c_type = std::string_view; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return Status::OK(); + } + const StringHeader* headers; + if (arr.buffers[1].data == NULLPTR) { + headers = NULLPTR; + } else { + // Do not apply the array offset to the values array; the value_offsets + // index the non-sliced values array. + headers = arr.GetValues(1); + } + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t (index)) { + return valid_func(static_cast(headers[index])); + }, + [&]() { + return null_func(); + }); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return; + } + const StringHeader* headers; + if (arr.buffers[1].data == NULLPTR) { + headers = NULLPTR; + } else { + // Do not apply the array offset to the values array; the value_offsets + // index the non-sliced values array. 
+ headers = arr.GetValues(1); + } + + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t (index)) { + valid_func(static_cast(headers[index])); + }, + std::forward(null_func)); + } +}; + // FixedSizeBinary, Decimal128 template struct ArraySpanInlineVisitor> { From 1d81aea55c5b67ee97e82def2af327f9d3a44ac0 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 18 Oct 2022 17:24:45 +0200 Subject: [PATCH 06/38] fix formatting --- cpp/src/arrow/visit_data_inline.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h index 116a031757a..41f1730b339 100644 --- a/cpp/src/arrow/visit_data_inline.h +++ b/cpp/src/arrow/visit_data_inline.h @@ -165,12 +165,10 @@ struct ArraySpanInlineVisitor> { } return VisitBitBlocks( arr.buffers[0].data, arr.offset, arr.length, - [&](int64_t (index)) { + [&](int64_t(index)) { return valid_func(static_cast(headers[index])); }, - [&]() { - return null_func(); - }); + [&]() { return null_func(); }); } template @@ -190,7 +188,7 @@ struct ArraySpanInlineVisitor> { VisitBitBlocksVoid( arr.buffers[0].data, arr.offset, arr.length, - [&](int64_t (index)) { + [&](int64_t(index)) { valid_func(static_cast(headers[index])); }, std::forward(null_func)); From 3931e251dd51e822d701045708dc035c6c4f28bc Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 18 Oct 2022 17:25:06 +0200 Subject: [PATCH 07/38] fix formatting --- cpp/src/arrow/type_traits.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 9d47493d889..02426a33fb2 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -844,8 +844,8 @@ using enable_if_has_c_type = enable_if_t::value, R>; template using has_string_view = std::integral_constant::value || - std::is_same::value || - std::is_same::value || + std::is_same::value || + std::is_same::value || std::is_same::value || 
std::is_same::value || std::is_same::value || From 7fe8e2d392cd72b54acff841d65980a3841d7f9b Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 18 Oct 2022 17:29:16 +0200 Subject: [PATCH 08/38] run binary data visitor tests on StringView/BinaryView --- cpp/src/arrow/array/array_binary_test.cc | 13 ++++++++++--- cpp/src/arrow/testing/gtest_util.h | 3 +++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 3bc9bb91a02..c9f1b1cfab0 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -883,11 +883,15 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { void SetUp() override { type_ = TypeTraits::type_singleton(); } void TestBasics() { - auto array = ArrayFromJSON(type_, R"(["foo", null, "bar"])"); + auto array = ArrayFromJSON( + type_, + R"(["foo", null, "bar", "inline_me", "allocate_me_aaaaa", "allocate_me_bbbb"])"); BinaryAppender appender; ArraySpanVisitor visitor; ASSERT_OK(visitor.Visit(*array->data(), &appender)); - ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"foo", "(null)", "bar"})); + ASSERT_THAT(appender.data, + ::testing::ElementsAreArray({"foo", "(null)", "bar", "inline_me", + "allocate_me_aaaaa", "allocate_me_bbbb"})); ARROW_UNUSED(visitor); // Workaround weird MSVC warning } @@ -904,7 +908,10 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { std::shared_ptr type_; }; -TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes); +using BinaryAndBin = ::testing::Types; + +TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes); TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 13fc0b3e81d..332b4d0df88 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -176,6 +176,9 @@ using 
DecimalArrowTypes = ::testing::Types; using BaseBinaryArrowTypes = ::testing::Types; +using BaseBinaryOrBinaryViewLikeArrowTypes = + ::testing::Types; + using BinaryArrowTypes = ::testing::Types; using StringArrowTypes = ::testing::Types; From 6624acc4eed13624a9f17ef3426ef0751de5ba58 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 15 Nov 2022 14:36:15 -0500 Subject: [PATCH 09/38] fixes in substrait, rename in LICENSE, owning scalars --- LICENSE.txt | 2 +- cpp/src/arrow/array/builder_binary.h | 3 +- .../engine/substrait/expression_internal.cc | 9 +++ .../arrow/engine/substrait/type_internal.cc | 7 +++ cpp/src/arrow/scalar.cc | 9 --- cpp/src/arrow/scalar.h | 32 ++++------ cpp/src/arrow/util/string_header.h | 60 +++++++++---------- 7 files changed, 55 insertions(+), 67 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index d355854e2c8..06347f5445d 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1978,7 +1978,7 @@ License: http://www.apache.org/licenses/LICENSE-2.0 This project includes code from Velox. - * cpp/src/arrow/util/bytes_header.h + * cpp/src/arrow/util/string_header.h is based on Velox's diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 3b4c0921f4d..21d8cd34485 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -508,8 +508,7 @@ class ARROW_EXPORT StringHeapBuilder { /// UnsafeAppend operations without the need to allocate more memory Status Reserve(int64_t num_bytes) { if (num_bytes > current_remaining_bytes_) { - current_remaining_bytes_ = - num_bytes > kDefaultBlocksize ? num_bytes : kDefaultBlocksize; + current_remaining_bytes_ = num_bytes > blocksize_ ? 
num_bytes : blocksize_; ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_block, AllocateBuffer(current_remaining_bytes_, pool_)); current_out_buffer_ = new_block->mutable_data(); diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index 5e214bdda4d..c595051dec9 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -710,6 +710,15 @@ struct ScalarToProtoImpl { s); } + Status Visit(const StringViewScalar& s) { + return FromBuffer([](Lit* lit, std::string&& s) { lit->set_string(std::move(s)); }, + s); + } + Status Visit(const BinaryViewScalar& s) { + return FromBuffer([](Lit* lit, std::string&& s) { lit->set_binary(std::move(s)); }, + s); + } + Status Visit(const FixedSizeBinaryScalar& s) { return FromBuffer( [](Lit* lit, std::string&& s) { lit->set_fixed_binary(std::move(s)); }, s); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index 03d1f999a14..89692df7bed 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -262,6 +262,13 @@ struct DataTypeToProtoImpl { return SetWith(&substrait::Type::set_allocated_binary); } + Status Visit(const StringViewType& t) { + return SetWith(&::substrait::Type::set_allocated_string); + } + Status Visit(const BinaryViewType& t) { + return SetWith(&::substrait::Type::set_allocated_binary); + } + Status Visit(const FixedSizeBinaryType& t) { SetWithThen(&substrait::Type::set_allocated_fixed_binary)->set_length(t.byte_width()); return Status::OK(); diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 622c767a443..33a63cd0427 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -71,12 +71,6 @@ struct ScalarHashImpl { Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); } - Status Visit(const BinaryViewScalar& s) { - const 
StringHeader& v = s.value; - hash_ ^= internal::ComputeStringHash<1>(v.data(), v.size()); - return Status::OK(); - } - template Status Visit(const TemporalScalar& s) { return ValueHash(s); @@ -565,9 +559,6 @@ Status Scalar::ValidateFull() const { BinaryScalar::BinaryScalar(std::string s) : BinaryScalar(Buffer::FromString(std::move(s))) {} -StringScalar::StringScalar(std::string s) - : StringScalar(Buffer::FromString(std::move(s))) {} - LargeBinaryScalar::LargeBinaryScalar(std::string s) : LargeBinaryScalar(Buffer::FromString(std::move(s))) {} diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index c1c08571a6a..27c0cc7c1ae 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -255,7 +255,6 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase { return value ? std::string_view(*value) : std::string_view(); } - protected: BaseBinaryScalar(std::shared_ptr value, std::shared_ptr type) : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {} }; @@ -264,9 +263,6 @@ struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; - BinaryScalar(std::shared_ptr value, std::shared_ptr type) - : BaseBinaryScalar(std::move(value), std::move(type)) {} - explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} @@ -282,37 +278,29 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { explicit StringScalar(std::shared_ptr value) : StringScalar(std::move(value), utf8()) {} - explicit StringScalar(std::string s); - StringScalar() : StringScalar(utf8()) {} }; -struct ARROW_EXPORT BinaryViewScalar : public internal::PrimitiveScalarBase { - using internal::PrimitiveScalarBase::PrimitiveScalarBase; +struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar { + using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryViewType; - explicit BinaryViewScalar(StringHeader value, std::shared_ptr 
type) - : internal::PrimitiveScalarBase(std::move(type), true), value(value) {} - - explicit BinaryViewScalar(StringHeader value) - : BinaryViewScalar(value, binary_view()) {} - - BinaryViewScalar() : internal::PrimitiveScalarBase(binary_view(), false) {} - - void* mutable_data() override { return reinterpret_cast(&this->value); } + explicit BinaryViewScalar(std::shared_ptr value) + : BinaryViewScalar(std::move(value), binary_view()) {} - std::string_view view() const override { return std::string_view(this->value); } + BinaryViewScalar() : BinaryViewScalar(binary_view()) {} - StringHeader value; + std::string_view view() const override { return std::string_view(*this->value); } }; struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { + using BinaryViewScalar::BinaryViewScalar; using TypeClass = StringViewType; - explicit StringViewScalar(StringHeader value) - : BinaryViewScalar(std::move(value), utf8_view()) {} + explicit StringViewScalar(std::shared_ptr value) + : StringViewScalar(std::move(value), utf8_view()) {} - StringViewScalar() : BinaryViewScalar(utf8_view()) {} + StringViewScalar() : StringViewScalar(utf8_view()) {} }; struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h index 29f378a580a..8ba18a83660 100644 --- a/cpp/src/arrow/util/string_header.h +++ b/cpp/src/arrow/util/string_header.h @@ -33,6 +33,7 @@ #pragma once +#include #include #include #include @@ -69,35 +70,27 @@ struct StringHeader { static constexpr size_t kPrefixSize = 4; static constexpr size_t kInlineSize = 12; - StringHeader() { - static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes"); - ; - memset(this, 0, sizeof(StringHeader)); - } + StringHeader() = default; - explicit StringHeader(uint32_t size) : size_(size) { - memset(prefix_, 0, kPrefixSize); - value_.data = nullptr; + static StringHeader makeInline(uint32_t size, char** data) { + assert(size <= 
kInlineSize); + StringHeader s; + s.size_ = size; + *data = const_cast(s.data()); + return s; } - StringHeader(const char* data, size_t len) : size_(len) { + StringHeader(const char* data, size_t len) : size_(static_cast(len)) { + if (size_ == 0) return; + // TODO: better option than assert? - assert(data || size_ == 0); + assert(data); if (IsInline()) { - // Zero the inline part. - // this makes sure that inline strings can be compared for equality with 2 - // int64 compares. - memset(prefix_, 0, kPrefixSize); - if (size_ == 0) { - return; - } - // small string: inlined. Zero the last 8 bytes first to allow for whole - // word comparison. - value_.data = nullptr; - memcpy(prefix_, data, size_); + // small string: inlined. Bytes beyond size_ are already 0 + memcpy(prefix_.data(), data, size_); } else { // large string: store pointer - memcpy(prefix_, data, kPrefixSize); + memcpy(prefix_.data(), data, kPrefixSize); value_.data = data; } } @@ -112,19 +105,20 @@ struct StringHeader { // StringHeader bh = "literal"; // std::optional obh = "literal"; // - /* implicit */ StringHeader(const char* data) : StringHeader(data, strlen(data)) {} + // NOLINTNEXTLINE runtime/explicit + StringHeader(const char* data) : StringHeader(data, strlen(data)) {} explicit StringHeader(const std::string& value) : StringHeader(value.data(), value.size()) {} - explicit StringHeader(const std::string_view& value) + explicit StringHeader(std::string_view value) : StringHeader(value.data(), value.size()) {} bool IsInline() const { return IsInline(size_); } static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; } - const char* data() const { return IsInline() ? prefix_ : value_.data; } + const char* data() const { return IsInline() ? prefix_.data() : value_.data; } size_t size() const { return size_; } @@ -160,7 +154,7 @@ struct StringHeader { if (PrefixAsInt() != other.PrefixAsInt()) { // The result is decided on prefix. 
The shorter will be less // because the prefix is padded with zeros. - return memcmp(prefix_, other.prefix_, kPrefixSize); + return memcmp(prefix_.data(), other.prefix_.data(), kPrefixSize); } int32_t size = std::min(size_, other.size_) - kPrefixSize; if (size <= 0) { @@ -168,7 +162,7 @@ struct StringHeader { return size_ - other.size_; } if (static_cast(size) <= kInlineSize && IsInline() && other.IsInline()) { - int32_t result = memcmp(value_.inlined, other.value_.inlined, size); + int32_t result = memcmp(value_.inlined.data(), other.value_.inlined.data(), size); return (result != 0) ? result : size_ - other.size_; } int32_t result = memcmp(data() + kPrefixSize, other.data() + kPrefixSize, size); @@ -183,9 +177,7 @@ struct StringHeader { bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; } - operator std::string() const { return std::string(data(), size()); } - - std::string GetString() const { return *this; } + std::string GetString() const { return std::string(data(), size()); } explicit operator std::string_view() const { return std::string_view(data(), size()); } @@ -208,12 +200,14 @@ struct StringHeader { // We rely on all members being laid out top to bottom . C++ // guarantees this. 
- uint32_t size_; - char prefix_[4]; + uint32_t size_ = 0; + std::array prefix_ = {0}; union { - char inlined[8]; + std::array inlined = {0}; const char* data; } value_; }; +static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes"); + } // namespace arrow From 8511bf11789ca4b9c144464122a27d5d51460457 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 18 Nov 2022 09:16:11 -0500 Subject: [PATCH 10/38] delete potentially internal viewing members for rvalues --- cpp/src/arrow/util/string_header.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h index 8ba18a83660..e3e9d9d69cd 100644 --- a/cpp/src/arrow/util/string_header.h +++ b/cpp/src/arrow/util/string_header.h @@ -83,7 +83,7 @@ struct StringHeader { StringHeader(const char* data, size_t len) : size_(static_cast(len)) { if (size_ == 0) return; - // TODO: better option than assert? + // TODO(bkietz) better option than assert? assert(data); if (IsInline()) { // small string: inlined. Bytes beyond size_ are already 0 @@ -118,7 +118,8 @@ struct StringHeader { static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; } - const char* data() const { return IsInline() ? prefix_.data() : value_.data; } + const char* data() const& { return IsInline() ? 
prefix_.data() : value_.data; } + const char* data() && = delete; size_t size() const { return size_; } @@ -179,11 +180,14 @@ struct StringHeader { std::string GetString() const { return std::string(data(), size()); } - explicit operator std::string_view() const { return std::string_view(data(), size()); } + explicit operator std::string_view() const& { return std::string_view(data(), size()); } + operator std::string_view() && = delete; - const char* begin() const { return data(); } + const char* begin() const& { return data(); } + const char* end() const& { return data() + size(); } - const char* end() const { return data() + size(); } + const char* begin() && = delete; + const char* end() && = delete; bool empty() const { return size() == 0; } From 6df010f60fe6d8302c36a7bae4d2d2e582cf8e67 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 18 Nov 2022 13:07:57 -0500 Subject: [PATCH 11/38] Added validation for StringView arrays --- cpp/src/arrow/array/array_base.cc | 4 +- cpp/src/arrow/array/array_binary.h | 38 ++++- cpp/src/arrow/array/array_binary_test.cc | 67 ++++++-- cpp/src/arrow/array/array_test.cc | 4 +- cpp/src/arrow/array/builder_base.cc | 17 +- cpp/src/arrow/array/builder_binary.h | 4 +- cpp/src/arrow/array/util.cc | 28 +++- cpp/src/arrow/array/validate.cc | 147 ++++++++++++++++-- cpp/src/arrow/compare.cc | 8 +- .../compute/kernels/scalar_nested_test.cc | 3 + .../compute/kernels/scalar_string_test.cc | 10 +- cpp/src/arrow/compute/kernels/vector_hash.cc | 94 +++-------- cpp/src/arrow/scalar.cc | 20 +-- cpp/src/arrow/scalar.h | 18 ++- cpp/src/arrow/testing/gtest_util.h | 6 +- cpp/src/arrow/type.h | 11 +- 16 files changed, 331 insertions(+), 148 deletions(-) diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index 5e84d928256..76e977a8716 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -88,7 +88,9 @@ struct ScalarFromArraySlotImpl { } Status Visit(const BinaryViewArray& a) { 
- return Status::NotImplemented("ScalarFromArraySlot -> BinaryView"); + StringHeader header = a.Value(index_); + std::string_view view{header}; + return Finish(std::string{view}); } Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); } diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 03ee77fab8b..1c8947dde3a 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -230,16 +230,37 @@ class ARROW_EXPORT BinaryViewArray : public PrimitiveArray { explicit BinaryViewArray(const std::shared_ptr& data); - BinaryViewArray(int64_t length, const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, + /// By default, ValidateFull() will check each view in a BinaryViewArray or + /// StringViewArray to ensure it references a memory range owned by one of the array's + /// buffers. + /// + /// If the last character buffer is null, ValidateFull will skip this step. Use this + /// for arrays which view memory elsewhere. 
+ static BufferVector DoNotValidateViews(BufferVector char_buffers) { + char_buffers.push_back(NULLPTR); + return char_buffers; + } + + static bool OptedOutOfViewValidation(const ArrayData& data) { + return data.buffers.back() == NULLPTR; + } + bool OptedOutOfViewValidation() const { return OptedOutOfViewValidation(*data_); } + + BinaryViewArray(int64_t length, std::shared_ptr data, BufferVector char_buffers, + std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : PrimitiveArray(binary_view(), length, data, null_bitmap, null_count, offset) {} + : PrimitiveArray(binary_view(), length, std::move(data), std::move(null_bitmap), + null_count, offset) { + for (auto& char_buffer : char_buffers) { + data_->buffers.push_back(std::move(char_buffer)); + } + } const StringHeader* raw_values() const { return reinterpret_cast(raw_values_) + data_->offset; } - StringHeader Value(int64_t i) const { return raw_values()[i]; } + const StringHeader& Value(int64_t i) const { return raw_values()[i]; } // For API compatibility with BinaryArray etc. 
std::string_view GetView(int64_t i) const { return std::string_view(Value(i)); } @@ -264,10 +285,13 @@ class ARROW_EXPORT StringViewArray : public BinaryViewArray { explicit StringViewArray(const std::shared_ptr& data); - StringViewArray(int64_t length, const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, + StringViewArray(int64_t length, std::shared_ptr data, BufferVector char_buffers, + std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : BinaryViewArray(utf8_view(), length, data, null_bitmap, null_count, offset) {} + : BinaryViewArray(length, std::move(data), std::move(char_buffers), + std::move(null_bitmap), null_count, offset) { + data_->type = utf8_view(); + } /// \brief Validate that this array contains only valid UTF8 entries /// diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index c9f1b1cfab0..92fc16f7759 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -32,6 +32,7 @@ #include "arrow/status.h" #include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -365,38 +366,73 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); } +TEST(StringViewArray, Validate) { + auto MakeArray = [](std::vector headers, BufferVector char_buffers) { + auto length = static_cast(headers.size()); + return StringViewArray(length, Buffer::Wrap(std::move(headers)), + std::move(char_buffers)); + }; + + // empty array is valid + EXPECT_THAT(MakeArray({}, {}).ValidateFull(), Ok()); + + // inline views need not have a corresponding buffer + EXPECT_THAT(MakeArray({"hello", "world", "inline me"}, {}).ValidateFull(), Ok()); + + auto buffer_s = 
Buffer::FromString("supercalifragilistic(sp?)"); + auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy"); + + // non-inline views are expected to reside in a buffer managed by the array + EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}), + StringHeader(std::string_view{*buffer_y})}, + {buffer_s, buffer_y}) + .ValidateFull(), + Ok()); + + EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}), + // if a view points outside the buffers, that is invalid + StringHeader("from a galaxy far, far away"), + StringHeader(std::string_view{*buffer_y})}, + {buffer_s, buffer_y}) + .ValidateFull(), + Raises(StatusCode::Invalid)); + + // ... unless specifically overridden + EXPECT_THAT( + MakeArray({"from a galaxy far, far away"}, StringViewArray::DoNotValidateViews({})) + .ValidateFull(), + Ok()); +} + template class TestUTF8Array : public ::testing::Test { public: using TypeClass = T; - using offset_type = typename TypeClass::offset_type; using ArrayType = typename TypeTraits::ArrayType; - Status ValidateUTF8(int64_t length, std::vector offsets, - std::string_view data, int64_t offset = 0) { - ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared(data), - /*null_bitmap=*/nullptr, /*null_count=*/0, offset); - return arr.ValidateUTF8(); + Status ValidateUTF8(const Array& arr) { + return checked_cast(arr).ValidateUTF8(); } - Status ValidateUTF8(const std::string& json) { - auto ty = TypeTraits::type_singleton(); - auto arr = ArrayFromJSON(ty, json); - return checked_cast(*arr).ValidateUTF8(); + Status ValidateUTF8(std::vector values) { + std::shared_ptr arr; + ArrayFromVector(values, &arr); + return ValidateUTF8(*arr); } void TestValidateUTF8() { - ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])")); - ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff + ASSERT_OK(ValidateUTF8(*ArrayFromJSON(TypeTraits::type_singleton(), + R"(["Voix", "ambiguë", "d’un", "cœur"])"))); + 
ASSERT_OK(ValidateUTF8({"\xf4\x8f\xbf\xbf"})); // \U0010ffff - ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4")); + ASSERT_RAISES(Invalid, ValidateUTF8({"\xf4"})); // More tests in TestValidateData() above // (ValidateFull() calls ValidateUTF8() internally) } }; -TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes); +TYPED_TEST_SUITE(TestUTF8Array, StringOrStringViewArrowTypes); TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); } @@ -908,9 +944,6 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { std::shared_ptr type_; }; -using BinaryAndBin = ::testing::Types; - TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes); TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 602a468fafb..7f64aa6d676 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -594,12 +594,14 @@ static ScalarVector GetScalars() { std::make_shared(60, duration(TimeUnit::SECOND)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), std::make_shared( hello, fixed_size_binary(static_cast(hello->size()))), std::make_shared(Decimal128(10), decimal(16, 4)), std::make_shared(Decimal256(10), decimal(76, 38)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), @@ -646,7 +648,7 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { ASSERT_EQ(array->null_count(), 0); // test case for ARROW-13321 - for (int64_t i : std::vector{0, length / 2, length - 1}) { + for (int64_t i : {int64_t{0}, length / 2, length - 1}) { ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i)); AssertScalarsEqual(*s, *scalar, /*verbose=*/true); } diff --git a/cpp/src/arrow/array/builder_base.cc 
b/cpp/src/arrow/array/builder_base.cc index 70da1fbb296..9cdfb0b4681 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -104,10 +104,7 @@ namespace { template struct AppendScalarImpl { template - enable_if_t::value || is_decimal_type::value || - is_fixed_size_binary_type::value, - Status> - Visit(const T&) { + Status HandleFixedWidth(const T&) { auto builder = checked_cast::BuilderType*>(builder_); RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); @@ -125,7 +122,17 @@ struct AppendScalarImpl { } template - enable_if_base_binary Visit(const T&) { + enable_if_t::value, Status> Visit(const T& t) { + return HandleFixedWidth(t); + } + + Status Visit(const FixedSizeBinaryType& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal128Type& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); } + + template + enable_if_t::value || is_string_like_type::value, Status> + Visit(const T&) { int64_t data_size = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { const auto& scalar = checked_cast::ScalarType&>(*it); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 21d8cd34485..86df360fec9 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -580,7 +580,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { Status Append(StringHeader value) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppend(value); - UnsafeAppendToBitmap(true); return Status::OK(); } @@ -595,7 +594,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { value = data_heap_builder_.UnsafeAppend(value, length); } UnsafeAppend(StringHeader(value, length)); - UnsafeAppendToBitmap(true); } void UnsafeAppend(const char* value, int64_t length) { @@ -657,7 +655,7 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { } void UnsafeAppendEmptyValue() { - 
data_builder_.UnsafeAppend(StringHeader("")); + data_builder_.UnsafeAppend(StringHeader()); UnsafeAppendToBitmap(true); } diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 0135fbb0049..191c039d021 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -380,6 +380,10 @@ class NullArrayFactory { return MaxOf(sizeof(typename T::offset_type) * (length_ + 1)); } + Status Visit(const BinaryViewType& type) { + return MaxOf(sizeof(StringHeader) * length_); + } + Status Visit(const FixedSizeListType& type) { return MaxOf(GetBufferLength(type.value_type(), type.list_size() * length_)); } @@ -499,6 +503,11 @@ class NullArrayFactory { return Status::OK(); } + Status Visit(const BinaryViewType&) { + out_->buffers.resize(2, buffer_); + return Status::OK(); + } + template enable_if_var_size_list Visit(const T& type) { out_->buffers.resize(2, buffer_); @@ -646,14 +655,27 @@ class RepeatedArrayFactory { RETURN_NOT_OK(CreateBufferOf(value->data(), value->size(), &values_buffer)); auto size = static_cast(value->size()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); - out_ = std::make_shared::ArrayType>(length_, offsets_buffer, - values_buffer); + out_ = std::make_shared::ArrayType>( + length_, std::move(offsets_buffer), std::move(values_buffer)); return Status::OK(); } template enable_if_binary_view_like Visit(const T&) { - return Status::NotImplemented("binary / string view"); + const std::shared_ptr& value = + checked_cast::ScalarType&>(scalar_).value; + + StringHeader header{std::string_view{*value}}; + std::shared_ptr header_buffer; + RETURN_NOT_OK(CreateBufferOf(&header, sizeof(header), &header_buffer)); + + BufferVector char_buffers; + if (!header.IsInline()) { + char_buffers.push_back(value); + } + out_ = std::make_shared::ArrayType>( + length_, std::move(header_buffer), std::move(char_buffers)); + return Status::OK(); } template diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 
7c08d8da38e..da497b2dccb 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -31,6 +31,7 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/util/unreachable.h" #include "arrow/util/utf8.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" @@ -43,10 +44,7 @@ namespace { struct UTF8DataValidator { const ArrayData& data; - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } + Status Visit(const DataType&) { Unreachable("utf-8 validation of non string type"); } Status Visit(const StringViewType&) { util::InitializeUTF8(); @@ -87,10 +85,7 @@ struct BoundsChecker { int64_t min_value; int64_t max_value; - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } + Status Visit(const DataType&) { Unreachable("bounds checking of non integer type"); } template enable_if_integer Visit(const IntegerType&) { @@ -261,9 +256,7 @@ struct ValidateArrayImpl { Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); } - Status Visit(const BinaryViewType& type) { - return Status::NotImplemented("binary / string view"); - } + Status Visit(const BinaryViewType& type) { return ValidateBinaryView(type); } Status Visit(const ListType& type) { return ValidateListLike(type); } @@ -470,7 +463,14 @@ struct ValidateArrayImpl { return Status::Invalid("Array length is negative"); } - if (data.buffers.size() != layout.buffers.size()) { + if (layout.variadic_spec) { + if (data.buffers.size() < layout.buffers.size()) { + return Status::Invalid("Expected at least ", layout.buffers.size(), + " buffers in array " + "of type ", + type.ToString(), ", got ", data.buffers.size()); + } + } else if (data.buffers.size() != layout.buffers.size()) { return Status::Invalid("Expected ", layout.buffers.size(), " buffers in array " "of type ", @@ -486,7 
+486,9 @@ struct ValidateArrayImpl { for (int i = 0; i < static_cast(data.buffers.size()); ++i) { const auto& buffer = data.buffers[i]; - const auto& spec = layout.buffers[i]; + const auto& spec = i < static_cast(layout.buffers.size()) + ? layout.buffers[i] + : *layout.variadic_spec; if (buffer == nullptr) { continue; @@ -612,6 +614,125 @@ struct ValidateArrayImpl { return Status::OK(); } + Status ValidateBinaryView(const BinaryViewType& type) { + int64_t headers_byte_size = data.buffers[1]->size(); + int64_t required_headers = data.length + data.offset; + if (static_cast(headers_byte_size / sizeof(StringHeader)) < + required_headers) { + return Status::Invalid("Header buffer size (bytes): ", headers_byte_size, + " isn't large enough for length: ", data.length, + " and offset: ", data.offset); + } + + if (!full_validation || BinaryViewArray::OptedOutOfViewValidation(data)) { + return Status::OK(); + } + + auto* headers = data.GetValues(1); + std::string_view buffer_containing_previous_view; + + auto IsSubrangeOf = [](std::string_view super, std::string_view sub) { + return super.data() <= sub.data() && + super.data() + super.size() <= sub.data() + sub.size(); + }; + + std::vector buffers; + for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) { + buffers.emplace_back(**it); + } + + auto CheckViews = [&](auto in_a_buffer, auto check_previous_buffer) { + if constexpr (check_previous_buffer) { + buffer_containing_previous_view = buffers.front(); + } + + for (int64_t i = 0; i < data.length; ++i) { + if (headers[i].IsInline()) continue; + + std::string_view view{headers[i]}; + + if constexpr (check_previous_buffer) { + if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, view))) { + // Fast path: for most string view arrays, we'll have runs + // of views into the same buffer. 
+ continue; + } + } + + if (!in_a_buffer(view)) { + return Status::Invalid( + "String view at slot ", i, + " views memory not resident in any buffer managed by the array"); + } + } + return Status::OK(); + }; + + if (buffers.empty()) { + // there are no character buffers; the only way this array + // can be valid is if all views are inline + return CheckViews([](std::string_view) { return std::false_type{}; }, + /*check_previous_buffer=*/std::false_type{}); + } + + // Simplest check for view-in-buffer: loop through buffers and check each one. + auto Linear = [&](std::string_view view) { + for (std::string_view buffer : buffers) { + if (IsSubrangeOf(buffer, view)) { + buffer_containing_previous_view = buffer; + return true; + } + } + return false; + }; + + if (buffers.size() <= 32) { + // If there are few buffers to search through, sorting/binary search is not + // worthwhile. TODO(bkietz) benchmark this and get a less magic number here. + return CheckViews(Linear, + /*check_previous_buffer=*/std::true_type{}); + } + + auto DataPtrLess = [](std::string_view l, std::string_view r) { + return l.data() < r.data(); + }; + + std::sort(buffers.begin(), buffers.end(), DataPtrLess); + bool non_overlapping = + buffers.end() != + std::adjacent_find(buffers.begin(), buffers.end(), + [](std::string_view before, std::string_view after) { + return before.data() + before.size() <= after.data(); + }); + if (ARROW_PREDICT_FALSE(!non_overlapping)) { + // Using a binary search with overlapping buffers would not *uniquely* identify + // a potentially-containing buffer. Moreover this should be a fairly rare case + // so optimizing for it seems premature. + return CheckViews(Linear, + /*check_previous_buffer=*/std::true_type{}); + } + + // More sophisticated check for view-in-buffer: binary search through the buffers. + return CheckViews( + [&](std::string_view view) { + // Find the first buffer whose data starts after the data in view- + // only buffers *before* this could contain view. 
Since we've additionally + // checked that the buffers do not overlap, only the buffer *immediately before* + // this could contain view. + auto one_past_potential_super = + std::upper_bound(buffers.begin(), buffers.end(), view, DataPtrLess); + + if (one_past_potential_super == buffers.begin()) return false; + + auto potential_super = *(one_past_potential_super - 1); + if (!IsSubrangeOf(potential_super, view)) return false; + + buffer_containing_previous_view = potential_super; + return true; + }, + /*check_previous_buffer=*/std::true_type{}); + } + template Status ValidateListLike(const ListType& type) { const ArrayData& values = *data.child_data[0]; diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 99d87a2d1b6..0f58f927ca4 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -807,19 +807,13 @@ class ScalarEqualsVisitor { Status Visit(const DoubleScalar& left) { return CompareFloating(left); } template - typename std::enable_if::value, Status>::type + enable_if_t::value, Status> Visit(const T& left) { const auto& right = checked_cast(right_); result_ = internal::SharedPtrEquals(left.value, right.value); return Status::OK(); } - Status Visit(const BinaryViewScalar& left) { - const auto& right = checked_cast(right_); - result_ = left.value == right.value; - return Status::OK(); - } - Status Visit(const Decimal128Scalar& left) { const auto& right = checked_cast(right_); result_ = left.value == right.value; diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index a72ec99620b..44d3f3a447c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -862,6 +862,9 @@ TEST(MakeStruct, Array) { EXPECT_THAT(MakeStructor({i32, str}, {"i", "s"}), ResultWith(Datum(*StructArray::Make({i32, str}, field_names)))); + EXPECT_THAT(*MakeScalar("aa"), testing::Eq(StringScalar("aa"))); + 
EXPECT_EQ(*MakeStructor({i32, MakeScalar("aa")}, {"i", "s"})->type(), + StructType({field("i", i32->type()), field("s", str->type())})); // Scalars are broadcast to the length of the arrays EXPECT_THAT(MakeStructor({i32, MakeScalar("aa")}, {"i", "s"}), ResultWith(Datum(*StructArray::Make({i32, str}, field_names)))); diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 4581e6377a7..50daec6cbaa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -48,7 +48,6 @@ namespace compute { template class BaseTestStringKernels : public ::testing::Test { protected: - using OffsetType = typename TypeTraits::OffsetType; using ScalarType = typename TypeTraits::ScalarType; void CheckUnary(std::string func_name, std::string json_input, @@ -98,7 +97,14 @@ class BaseTestStringKernels : public ::testing::Test { } std::shared_ptr offset_type() { - return TypeTraits::type_singleton(); + if constexpr (is_binary_view_like_type::value) { + // Views do not have offsets, but Functions like binary_length + // will return the length as uint32 + return uint32(); + } else { + using OffsetType = typename TypeTraits::OffsetType; + return TypeTraits::type_singleton(); + } } template diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 2eab7ae8afa..9f20b640271 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -30,6 +30,7 @@ #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/util/hashing.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -261,7 +262,7 @@ class HashKernel : public KernelState { // Base class for all "regular" hash kernel implementations // (NullType has a separate implementation) -template class RegularHashKernel : public HashKernel { public: @@ -501,39 +502,13 @@ class 
DictionaryHashKernel : public HashKernel { }; // ---------------------------------------------------------------------- - -template -struct HashKernelTraits {}; - -template -struct HashKernelTraits> { - using HashKernel = NullHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -Result> HashInitImpl(KernelContext* ctx, - const KernelInitArgs& args) { - using HashKernelType = typename HashKernelTraits::HashKernel; - auto result = std::make_unique(args.inputs[0].GetSharedPtr(), - args.options, ctx->memory_pool()); - RETURN_NOT_OK(result->Reset()); - return std::move(result); -} - -template +template Result> HashInit(KernelContext* ctx, const KernelInitArgs& args) { - return HashInitImpl(ctx, args); + auto result = std::make_unique(args.inputs[0].GetSharedPtr(), args.options, + ctx->memory_pool()); + RETURN_NOT_OK(result->Reset()); + return std::move(result); } template @@ -542,22 +517,22 @@ KernelInit GetHashInit(Type::type type_id) { // representation switch (type_id) { case Type::NA: - return HashInit; + return HashInit>; case Type::BOOL: - return HashInit; + return HashInit>; case Type::INT8: case Type::UINT8: - return HashInit; + return HashInit>; case Type::INT16: case Type::UINT16: - return HashInit; + return HashInit>; case Type::INT32: case Type::UINT32: case Type::FLOAT: case Type::DATE32: case Type::TIME32: case Type::INTERVAL_MONTHS: - return HashInit; + return HashInit>; case Type::INT64: case Type::UINT64: case Type::DOUBLE: @@ -566,22 +541,23 @@ KernelInit GetHashInit(Type::type type_id) { case Type::TIMESTAMP: case Type::DURATION: case Type::INTERVAL_DAY_TIME: - return HashInit; + return HashInit>; case Type::BINARY: case Type::STRING: - return HashInit; + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + return HashInit>; case Type::LARGE_BINARY: case Type::LARGE_STRING: - return HashInit; + return HashInit>; 
case Type::FIXED_SIZE_BINARY: case Type::DECIMAL128: case Type::DECIMAL256: - return HashInit; + return HashInit>; case Type::INTERVAL_MONTH_DAY_NANO: - return HashInit; + return HashInit>; default: - DCHECK(false); - return nullptr; + Unreachable("non hashable type"); } } @@ -591,31 +567,11 @@ template Result> DictionaryHashInit(KernelContext* ctx, const KernelInitArgs& args) { const auto& dict_type = checked_cast(*args.inputs[0].type); - Result> indices_hasher; - switch (dict_type.index_type()->id()) { - case Type::INT8: - case Type::UINT8: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT16: - case Type::UINT16: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT32: - case Type::UINT32: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT64: - case Type::UINT64: - indices_hasher = HashInitImpl(ctx, args); - break; - default: - DCHECK(false) << "Unsupported dictionary index type"; - break; - } - RETURN_NOT_OK(indices_hasher); - return std::make_unique(std::move(indices_hasher.ValueOrDie()), - dict_type.value_type()); + ARROW_ASSIGN_OR_RAISE(auto indices_hasher, + GetHashInit(dict_type.index_type()->id())(ctx, args)); + return std::make_unique( + checked_pointer_cast(std::move(indices_hasher)), + dict_type.value_type()); } Status HashExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 33a63cd0427..c20f7396990 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -263,13 +263,11 @@ struct ScalarValidateImpl { Status Visit(const StringScalar& s) { return ValidateStringScalar(s); } - Status Visit(const BinaryViewScalar& s) { - return Status::NotImplemented("Binary view"); - } + Status Visit(const BinaryViewScalar& s) { return ValidateBinaryScalar(s); } - Status Visit(const StringViewScalar& s) { - return Status::NotImplemented("String view"); - } + Status Visit(const StringViewScalar& s) { return 
ValidateStringScalar(s); } + + Status Visit(const LargeBinaryScalar& s) { return ValidateBinaryScalar(s); } Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); } @@ -556,14 +554,8 @@ Status Scalar::ValidateFull() const { return ScalarValidateImpl(/*full_validation=*/true).Validate(*this); } -BinaryScalar::BinaryScalar(std::string s) - : BinaryScalar(Buffer::FromString(std::move(s))) {} - -LargeBinaryScalar::LargeBinaryScalar(std::string s) - : LargeBinaryScalar(Buffer::FromString(std::move(s))) {} - -LargeStringScalar::LargeStringScalar(std::string s) - : LargeStringScalar(Buffer::FromString(std::move(s))) {} +BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {} FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr value, std::shared_ptr type, diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 27c0cc7c1ae..97a6b4787d4 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -257,6 +257,8 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase { BaseBinaryScalar(std::shared_ptr value, std::shared_ptr type) : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {} + + BaseBinaryScalar(std::string s, std::shared_ptr type); }; struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { @@ -266,7 +268,7 @@ struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} - explicit BinaryScalar(std::string s); + explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} }; @@ -278,6 +280,8 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { explicit StringScalar(std::shared_ptr value) : StringScalar(std::move(value), utf8()) {} + explicit StringScalar(std::string s) : BinaryScalar(std::move(s), utf8()) {} + 
StringScalar() : StringScalar(utf8()) {} }; @@ -288,6 +292,9 @@ struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar { explicit BinaryViewScalar(std::shared_ptr value) : BinaryViewScalar(std::move(value), binary_view()) {} + explicit BinaryViewScalar(std::string s) + : BaseBinaryScalar(std::move(s), binary_view()) {} + BinaryViewScalar() : BinaryViewScalar(binary_view()) {} std::string_view view() const override { return std::string_view(*this->value); } @@ -300,6 +307,9 @@ struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { explicit StringViewScalar(std::shared_ptr value) : StringViewScalar(std::move(value), utf8_view()) {} + explicit StringViewScalar(std::string s) + : BinaryViewScalar(std::move(s), utf8_view()) {} + StringViewScalar() : StringViewScalar(utf8_view()) {} }; @@ -313,7 +323,8 @@ struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { explicit LargeBinaryScalar(std::shared_ptr value) : LargeBinaryScalar(std::move(value), large_binary()) {} - explicit LargeBinaryScalar(std::string s); + explicit LargeBinaryScalar(std::string s) + : BaseBinaryScalar(std::move(s), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} }; @@ -325,7 +336,8 @@ struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { explicit LargeStringScalar(std::shared_ptr value) : LargeStringScalar(std::move(value), large_utf8()) {} - explicit LargeStringScalar(std::string s); + explicit LargeStringScalar(std::string s) + : LargeBinaryScalar(std::move(s), large_utf8()) {} LargeStringScalar() : LargeStringScalar(large_utf8()) {} }; diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 332b4d0df88..35b60174178 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -177,12 +177,16 @@ using BaseBinaryArrowTypes = ::testing::Types; using BaseBinaryOrBinaryViewLikeArrowTypes = - ::testing::Types; + ::testing::Types; using BinaryArrowTypes = 
::testing::Types; using StringArrowTypes = ::testing::Types; +using StringOrStringViewArrowTypes = + ::testing::Types; + using ListArrowTypes = ::testing::Types; using UnionArrowTypes = ::testing::Types; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index fb9b80d9f34..e9b171d9d88 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -114,8 +114,14 @@ struct ARROW_EXPORT DataTypeLayout { std::vector buffers; /// Whether this type expects an associated dictionary array. bool has_dictionary = false; + /// If this is provided, the number of buffers expected is only lower-bounded by + /// buffers.size(). Buffers beyond this lower bound are expected to conform to + /// variadic_spec. + std::optional variadic_spec; - explicit DataTypeLayout(std::vector v) : buffers(std::move(v)) {} + explicit DataTypeLayout(std::vector buffers, + std::optional variadic_spec = {}) + : buffers(std::move(buffers)), variadic_spec(variadic_spec) {} }; /// \brief Base class for all data types @@ -725,7 +731,8 @@ class ARROW_EXPORT BinaryViewType : public DataType { DataTypeLayout layout() const override { return DataTypeLayout( - {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))}); + {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))}, + DataTypeLayout::VariableWidth()); } std::string ToString() const override; From 5c24fd545eb29b948a19e36926ff358a983d7662 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 18 Nov 2022 17:04:47 -0500 Subject: [PATCH 12/38] Adding comparison and concatenation --- cpp/src/arrow/array/builder_binary.h | 12 ++++++++---- cpp/src/arrow/array/concatenate.cc | 24 +++++++++++++++++++++++- cpp/src/arrow/array/concatenate_test.cc | 8 ++++++++ cpp/src/arrow/compare.cc | 8 +++++++- cpp/src/arrow/testing/random.cc | 14 +++++++++++--- cpp/src/arrow/testing/random.h | 16 ++++++++++++++++ 6 files changed, 73 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h 
b/cpp/src/arrow/array/builder_binary.h index 86df360fec9..d4f835ad1e6 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -546,9 +546,16 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { public: using TypeClass = BinaryViewType; - BinaryViewBuilder(const std::shared_ptr& type, MemoryPool* pool) + // this constructor provided for MakeBuilder compatibility + BinaryViewBuilder(const std::shared_ptr&, MemoryPool* pool) : BinaryViewBuilder(pool) {} + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + data_builder_(pool, alignment), + data_heap_builder_(pool) {} + int64_t current_block_bytes_remaining() const { return data_heap_builder_.current_remaining_bytes(); } @@ -687,9 +694,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { std::shared_ptr type() const override { return binary_view(); } protected: - explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), data_builder_(pool), data_heap_builder_(pool) {} - static constexpr int64_t ValueSizeLimit() { return std::numeric_limits::max(); } diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index f2abaed4a46..a5175eb31cc 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -230,7 +230,29 @@ class ConcatenateImpl { } Status Visit(const BinaryViewType&) { - return Status::NotImplemented("binary / string view"); + bool any_opted_out_of_view_validation = false; + out_->buffers.resize(2); + + for (const auto& in_data : in_) { + auto begin = in_data->buffers.begin() + 2; + auto end = in_data->buffers.end(); + + if (BinaryViewArray::OptedOutOfViewValidation(*in_data)) { + any_opted_out_of_view_validation = true; + --end; + } + + for (auto it = begin; it != end; ++it) { + out_->buffers.push_back(*it); + } + } + + if (any_opted_out_of_view_validation) 
{ + out_->buffers = BinaryViewArray::DoNotValidateViews(std::move(out_->buffers)); + } + + ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, sizeof(StringHeader))); + return ConcatenateBuffers(header_buffers, pool_).Value(&out_->buffers[1]); } Status Visit(const ListType&) { diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 4c03fab731f..35a75420d92 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -92,6 +92,7 @@ class ConcatenateTest : public ::testing::Test { for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); + ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); auto slices = this->Slices(array, offsets); ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices)); @@ -155,6 +156,13 @@ TEST_F(ConcatenateTest, StringType) { }); } +TEST_F(ConcatenateTest, StringViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability); + ASSERT_OK((**out).ValidateFull()); + }); +} + TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { *out = diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 0f58f927ca4..dae1fc6f97a 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -263,7 +263,13 @@ class RangeDataEqualsImpl { // Also matches StringViewType Status Visit(const BinaryViewType& type) { - return Status::NotImplemented("Binary / string view"); + auto* left_values = left_.GetValues(1) + left_start_idx_; + auto* right_values = right_.GetValues(1) + right_start_idx_; + VisitValidRuns([&](int64_t i, int64_t length) { + return std::equal(left_values + i, left_values + i + length, + right_values + i, right_values + i + length); + }); + 
return Status::OK(); } // Also matches LargeStringType diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index b8ea247a437..1aec5840eb2 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -362,13 +362,12 @@ std::shared_ptr RandomArrayGenerator::Decimal256(std::shared_ptr +template static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { - using offset_type = typename TypeClass::offset_type; using BuilderType = typename TypeTraits::BuilderType; using OffsetArrowType = typename CTypeTraits::ArrowType; using OffsetArrayType = typename TypeTraits::ArrayType; @@ -386,7 +385,7 @@ static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int /*null_probability=*/0); std::vector str_buffer(max_length); - BuilderType builder(memory_pool, alignment); + BuilderType builder{memory_pool, alignment}; for (int64_t i = 0; i < size; ++i) { if (lengths->IsValid(i)) { @@ -429,6 +428,15 @@ std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( return *strings->View(binary()); } +std::shared_ptr RandomArrayGenerator::StringView(int64_t size, int32_t min_length, + int32_t max_length, + double null_probability, + int64_t alignment, + MemoryPool* memory_pool) { + return GenerateBinaryArray(this, size, min_length, max_length, + null_probability, alignment, memory_pool); +} + std::shared_ptr RandomArrayGenerator::StringWithRepeats( int64_t size, int64_t unique, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 1bd189c39c2..999afdc60fd 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -367,6 +367,22 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, 
MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random StringViewArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] null_probability the probability of a value being null + /// + /// \return a generated Array + std::shared_ptr StringView(int64_t size, int32_t min_length, int32_t max_length, + double null_probability = 0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random LargeStringArray /// /// \param[in] size the size of the array to generate From 190648c7f9ed77d42bf766017470280b79f7ebcf Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Sat, 19 Nov 2022 21:28:48 -0500 Subject: [PATCH 13/38] wrote <=, needed >= --- cpp/src/arrow/array/concatenate_test.cc | 2 +- cpp/src/arrow/array/validate.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 35a75420d92..c074db8a886 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -92,7 +92,7 @@ class ConcatenateTest : public ::testing::Test { for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); - ASSERT_OK(array->ValidateFull()); + ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); auto slices = this->Slices(array, offsets); ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices)); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index da497b2dccb..a0526acc93b 100644 --- 
a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -633,7 +633,7 @@ struct ValidateArrayImpl { auto IsSubrangeOf = [](std::string_view super, std::string_view sub) { return super.data() <= sub.data() && - super.data() + super.size() <= sub.data() + sub.size(); + super.data() + super.size() >= sub.data() + sub.size(); }; std::vector buffers; @@ -661,7 +661,7 @@ struct ValidateArrayImpl { if (!in_a_buffer(view)) { return Status::Invalid( - "String view at slot ", i, + "String view at slot ", i, " @", (std::uintptr_t)view.data(), " views memory not resident in any buffer managed by the array"); } } From 018b49f169ed7ce940ee4922e19177f54b38831e Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 28 Nov 2022 14:09:59 -0500 Subject: [PATCH 14/38] Extract visitation of views owning buffers --- cpp/src/arrow/array/array_binary_test.cc | 8 + cpp/src/arrow/array/array_test.cc | 52 ++--- cpp/src/arrow/array/validate.cc | 242 +++++++++++++---------- cpp/src/arrow/buffer.h | 8 + cpp/src/arrow/testing/random.cc | 10 + 5 files changed, 190 insertions(+), 130 deletions(-) diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 92fc16f7759..f21abf681f9 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -389,6 +389,14 @@ TEST(StringViewArray, Validate) { .ValidateFull(), Ok()); + // overlapping views and buffers are allowed + EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}), + StringHeader(std::string_view{*buffer_s}.substr(5, 5)), + StringHeader(std::string_view{*buffer_s}.substr(9, 4))}, + {buffer_s, SliceBuffer(buffer_s, 1, 1), SliceBuffer(buffer_s, 3, 6)}) + .ValidateFull(), + Ok()); + EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}), // if a view points outside the buffers, that is invalid StringHeader("from a galaxy far, far away"), diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc 
index 7f64aa6d676..ff87e0b8ddb 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -722,32 +722,32 @@ TEST_F(TestArray, TestMakeEmptyArray) { FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))}); std::vector union_type_codes{7, 42}; - std::shared_ptr types[] = {null(), - boolean(), - int8(), - uint16(), - int32(), - uint64(), - float64(), - binary(), - large_binary(), - fixed_size_binary(3), - decimal(16, 4), - utf8(), - large_utf8(), - list(utf8()), - list(int64()), - large_list(large_utf8()), - fixed_size_list(utf8(), 3), - fixed_size_list(int64(), 4), - dictionary(int32(), utf8()), - struct_({field("a", utf8()), field("b", int32())}), - sparse_union(union_fields1, union_type_codes), - sparse_union(union_fields2, union_type_codes), - dense_union(union_fields1, union_type_codes), - dense_union(union_fields2, union_type_codes)}; - - for (auto type : types) { + for (auto type : {null(), + boolean(), + int8(), + uint16(), + int32(), + uint64(), + float64(), + binary(), + binary_view(), + large_binary(), + fixed_size_binary(3), + decimal(16, 4), + utf8(), + utf8_view(), + large_utf8(), + list(utf8()), + list(int64()), + large_list(large_utf8()), + fixed_size_list(utf8(), 3), + fixed_size_list(int64(), 4), + dictionary(int32(), utf8()), + struct_({field("a", utf8()), field("b", int32())}), + sparse_union(union_fields1, union_type_codes), + sparse_union(union_fields2, union_type_codes), + dense_union(union_fields1, union_type_codes), + dense_union(union_fields2, union_type_codes)}) { ARROW_SCOPED_TRACE("type = ", type->ToString()); ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type)); ASSERT_OK(array->ValidateFull()); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index a0526acc93b..90ddd45161f 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -31,13 +31,141 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" 
#include "arrow/util/ree_util.h" +#include "arrow/util/sort.h" +#include "arrow/util/string.h" #include "arrow/util/unreachable.h" #include "arrow/util/utf8.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" -namespace arrow { -namespace internal { +namespace arrow::internal { + +/// visitor will be called once for each non-inlined StringHeader. +/// It will be passed the index of each non-inlined StringHeader, +/// as well as a `const shared_ptr&` of the buffer +/// wherein the viewed memory resides, or nullptr if the viewed memory +/// is not in a buffer managed by this array. +template +Status VisitNonInlinedViewsAndOwningBuffers(const ArrayData& data, + const Visitor& visitor) { + auto* headers = data.buffers[1]->data_as(); + + static const std::shared_ptr kNullBuffer = nullptr; + + if (data.buffers.size() == 2 || + (data.buffers.size() == 3 && data.buffers.back() == nullptr)) { + // there are no character buffers, just visit a null buffer + for (int64_t i = 0; i < data.length; ++i) { + if (headers[i].IsInline()) continue; + RETURN_NOT_OK(visitor(i, kNullBuffer)); + } + return Status::OK(); + } + + auto IsSubrangeOf = [](std::string_view super, StringHeader sub) { + return super.data() <= sub.data() && + super.data() + super.size() >= sub.data() + sub.size(); + }; + + std::vector buffers; + std::vector*> owning_buffers; + for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) { + if (*it != nullptr) { + buffers.emplace_back(**it); + owning_buffers.push_back(&*it); + } + } + + const int not_found = static_cast(buffers.size()); + + auto DoVisit = [&](auto get_buffer) { + DCHECK(!buffers.empty()); + + // owning_buffers[not_found] points to the null placeholder + owning_buffers.push_back(&kNullBuffer); + + std::string_view buffer_containing_previous_view = buffers.front(); + int buffer_i = 0; + + for (int64_t i = 0; i < data.length; ++i) { + if (headers[i].IsInline()) continue; + + if 
(ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, headers[i]))) { + // Fast path: for most string view arrays, we'll have runs + // of views into the same buffer. + } else { + buffer_i = get_buffer(headers[i]); + if (buffer_i != not_found) { + // if we didn't find a buffer which owns headers[i], we can hope + // that there was just one out of line string and check + // buffer_containing_previous_view next iteration + buffer_containing_previous_view = buffers[buffer_i]; + } + } + + RETURN_NOT_OK(visitor(i, *owning_buffers[buffer_i])); + } + return Status::OK(); + }; + + // Simplest check for view-in-buffer: loop through buffers and check each one. + auto Linear = [&](StringHeader view) { + int i = 0; + for (std::string_view buffer : buffers) { + if (IsSubrangeOf(buffer, view)) return i; + ++i; + } + return not_found; + }; + + if (buffers.size() <= 32) { + // If there are few buffers to search through, sorting/binary search is not + // worthwhile. TODO(bkietz) benchmark this and get a less magic number here. + return DoVisit(Linear); + } + + auto DataPtrLess = [](std::string_view l, std::string_view r) { + return l.data() < r.data(); + }; + + { + auto sort_indices = ArgSort(buffers, DataPtrLess); + Permute(sort_indices, &buffers); + Permute(sort_indices, &owning_buffers); + } + + bool non_overlapping = + buffers.end() != + std::adjacent_find(buffers.begin(), buffers.end(), + [](std::string_view before, std::string_view after) { + return before.data() + before.size() <= after.data(); + }); + if (ARROW_PREDICT_FALSE(!non_overlapping)) { + // Using a binary search with overlapping buffers would not *uniquely* identify + // a potentially-containing buffer. Moreover this should be a fairly rare case + // so optimizing for it seems premature. + return DoVisit(Linear); + } + + // More sophisticated check for view-in-buffer: binary search through the buffers. 
+ return DoVisit([&](StringHeader view) { + // Find the first buffer whose data starts after the data in view- + // only buffers *before* this could contain view. Since we've additionally + // checked that the buffers do not overlap, only the buffer *immediately before* + // this could contain view. + int one_past_potential_super = + static_cast(std::upper_bound(buffers.begin(), buffers.end(), + std::string_view{view}, DataPtrLess) - + buffers.begin()); + + if (one_past_potential_super == 0) return not_found; + + int i = one_past_potential_super - 1; + if (IsSubrangeOf(buffers[i], view)) return i; + + return not_found; + }); +} namespace { @@ -628,109 +756,16 @@ struct ValidateArrayImpl { return Status::OK(); } - auto* headers = data.GetValues(1); - std::string_view buffer_containing_previous_view; - - auto IsSubrangeOf = [](std::string_view super, std::string_view sub) { - return super.data() <= sub.data() && - super.data() + super.size() >= sub.data() + sub.size(); - }; - - std::vector buffers; - for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) { - buffers.emplace_back(**it); - } - - auto CheckViews = [&](auto in_a_buffer, auto check_previous_buffer) { - if constexpr (check_previous_buffer) { - buffer_containing_previous_view = buffers.front(); - } - - for (int64_t i = 0; i < data.length; ++i) { - if (headers[i].IsInline()) continue; + return VisitNonInlinedViewsAndOwningBuffers( + data, [&](int64_t i, const std::shared_ptr& owner) { + if (ARROW_PREDICT_TRUE(owner != nullptr)) return Status::OK(); - std::string_view view{headers[i]}; - - if constexpr (check_previous_buffer) { - if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, view))) { - // Fast path: for most string view arrays, we'll have runs - // of views into the same buffer. 
- continue; - } - } - - if (!in_a_buffer(view)) { + auto* ptr = data.buffers[1]->data_as()[i].data(); return Status::Invalid( - "String view at slot ", i, " @", (std::uintptr_t)view.data(), + "String view at slot ", i, " @", + arrow::HexEncode(reinterpret_cast(&ptr), sizeof(ptr)), " views memory not resident in any buffer managed by the array"); - } - } - return Status::OK(); - }; - - if (buffers.empty()) { - // there are no character buffers; the only way this array - // can be valid is if all views are inline - return CheckViews([](std::string_view) { return std::false_type{}; }, - /*check_previous_buffer=*/std::false_type{}); - } - - // Simplest check for view-in-buffer: loop through buffers and check each one. - auto Linear = [&](std::string_view view) { - for (std::string_view buffer : buffers) { - if (IsSubrangeOf(buffer, view)) { - buffer_containing_previous_view = buffer; - return true; - } - } - return false; - }; - - if (buffers.size() <= 32) { - // If there are few buffers to search through, sorting/binary search is not - // worthwhile. TODO(bkietz) benchmark this and get a less magic number here. - return CheckViews(Linear, - /*check_previous_buffer=*/std::true_type{}); - } - - auto DataPtrLess = [](std::string_view l, std::string_view r) { - return l.data() < r.data(); - }; - - std::sort(buffers.begin(), buffers.end(), DataPtrLess); - bool non_overlapping = - buffers.end() != - std::adjacent_find(buffers.begin(), buffers.end(), - [](std::string_view before, std::string_view after) { - return before.data() + before.size() <= after.data(); - }); - if (ARROW_PREDICT_FALSE(!non_overlapping)) { - // Using a binary search with overlapping buffers would not *uniquely* identify - // a potentially-containing buffer. Moreover this should be a fairly rare case - // so optimizing for it seems premature. 
- return CheckViews(Linear, - /*check_previous_buffer=*/std::true_type{}); - } - - // More sophisticated check for view-in-buffer: binary search through the buffers. - return CheckViews( - [&](std::string_view view) { - // Find the first buffer whose data starts after the data in view- - // only buffers *before* this could contain view. Since we've additionally - // checked that the buffers do not overlap, only the buffer *immediately before* - // this could contain view. - auto one_past_potential_super = - std::upper_bound(buffers.begin(), buffers.end(), view, DataPtrLess); - - if (one_past_potential_super == buffers.begin()) return false; - - auto potential_super = *(one_past_potential_super - 1); - if (!IsSubrangeOf(potential_super, view)) return false; - - buffer_containing_previous_view = potential_super; - return true; - }, - /*check_previous_buffer=*/std::true_type{}); + }); } template @@ -943,5 +978,4 @@ Status ValidateUTF8(const ArrayData& data) { ARROW_EXPORT Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 65f1abda161..ac231e72342 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -186,6 +186,10 @@ class ARROW_EXPORT Buffer { #endif return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR; } + template + const T* data_as() const { + return reinterpret_cast(data()); + } /// \brief Return a writable pointer to the buffer's data /// @@ -203,6 +207,10 @@ class ARROW_EXPORT Buffer { return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? 
const_cast(data_) : NULLPTR; } + template + T* mutable_data_as() { + return reinterpret_cast(mutable_data()); + } /// \brief Return the device address of the buffer's data uintptr_t address() const { return reinterpret_cast(data_); } diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 1aec5840eb2..8aa3c781365 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -850,6 +850,16 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t ->View(field.type()); } + case Type::type::STRING_VIEW: + case Type::type::BINARY_VIEW: { + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + return *StringView(length, min_length, max_length, null_probability) + ->View(field.type()); + } + case Type::type::DECIMAL128: return Decimal128(field.type(), length, null_probability, alignment, memory_pool); From 0ee9d896e95371fffcf2c313abbd04a7c119501e Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 29 Nov 2022 16:02:29 -0500 Subject: [PATCH 15/38] add cast to/from string_view --- cpp/src/arrow/array/data.cc | 13 +- cpp/src/arrow/array/data.h | 21 +- cpp/src/arrow/array/validate.cc | 62 ++-- cpp/src/arrow/compute/exec.cc | 3 + .../compute/kernels/scalar_cast_internal.cc | 12 +- .../compute/kernels/scalar_cast_internal.h | 3 - .../compute/kernels/scalar_cast_numeric.cc | 4 +- .../compute/kernels/scalar_cast_string.cc | 277 ++++++++++++------ .../arrow/compute/kernels/scalar_cast_test.cc | 66 +++-- 9 files changed, 300 insertions(+), 161 deletions(-) diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 8764e9c354c..ec42e90b9a3 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -186,7 +186,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { } this->offset = data.offset; - for (int i = 0; i < static_cast(data.buffers.size()); ++i) { + 
for (int i = 0; i < std::min(static_cast(data.buffers.size()), 3); ++i) { const std::shared_ptr& buffer = data.buffers[i]; // It is the invoker-of-kernels's responsibility to ensure that // const buffers are not written to accidentally. @@ -346,6 +346,17 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } this->buffers[2].data = const_cast(data_buffer); this->buffers[2].size = data_size; + } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) { + const auto& scalar = checked_cast(value); + this->buffers[1].data = reinterpret_cast(this->scratch_space); + if (scalar.is_valid) { + *reinterpret_cast(this->buffers[1].data) = {scalar.value->data(), + scalar.value->size()}; + this->buffers[2].data = const_cast(scalar.value->data()); + this->buffers[2].size = scalar.value->size(); + } else { + *reinterpret_cast(this->buffers[1].data) = {}; + } } else if (type_id == Type::FIXED_SIZE_BINARY) { const auto& scalar = checked_cast(value); this->buffers[1].data = const_cast(scalar.value->data()); diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 82a6e733727..6294f65a858 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -468,10 +468,10 @@ struct ARROW_EXPORT ArraySpan { void SetSlice(int64_t offset, int64_t length) { this->offset = offset; this->length = length; - if (this->type->id() != Type::NA) { - this->null_count = kUnknownNullCount; - } else { + if (this->type->id() == Type::NA) { this->null_count = this->length; + } else if (this->MayHaveNulls()) { + this->null_count = kUnknownNullCount; } } @@ -547,6 +547,21 @@ struct ARROW_EXPORT ArraySpan { namespace internal { +template +Status VisitSlices(ArraySpan input, int64_t slice_size, const F& f) { + int64_t num_slices = input.length / slice_size; + int64_t trailing_slice_size = input.length % slice_size; + int64_t offset = input.offset; + + for (int64_t i = 0; i < num_slices; ++i) { + input.SetSlice(offset, slice_size); + ARROW_RETURN_NOT_OK(f(input)); 
+ offset += slice_size; + } + input.SetSlice(offset, trailing_slice_size); + return f(input); +} + void FillZeroLengthArray(const DataType* type, ArraySpan* span); /// Construct a zero-copy view of this ArrayData with the given type. diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 90ddd45161f..4d80c4d92e1 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -172,39 +172,29 @@ namespace { struct UTF8DataValidator { const ArrayData& data; - Status Visit(const DataType&) { Unreachable("utf-8 validation of non string type"); } - - Status Visit(const StringViewType&) { - util::InitializeUTF8(); - - const auto* values = data.GetValues(1); - for (int64_t i = 0; i < data.length; ++i) { - if (ARROW_PREDICT_FALSE(!util::ValidateUTF8( - reinterpret_cast(values[i].data()), values[i].size()))) { - return Status::Invalid("Invalid UTF8 sequence at string index ", i); - } + template + Status Visit(const T&) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + util::InitializeUTF8(); + + int64_t i = 0; + return VisitArraySpanInline( + data, + [&](std::string_view v) { + if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { + return Status::Invalid("Invalid UTF8 sequence at string index ", i); + } + ++i; + return Status::OK(); + }, + [&]() { + ++i; + return Status::OK(); + }); + } else { + Unreachable("utf-8 validation of non string type"); } - return Status::OK(); - } - - template - enable_if_string Visit(const StringType&) { - util::InitializeUTF8(); - - int64_t i = 0; - return VisitArraySpanInline( - data, - [&](std::string_view v) { - if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { - return Status::Invalid("Invalid UTF8 sequence at string index ", i); - } - ++i; - return Status::OK(); - }, - [&]() { - ++i; - return Status::OK(); - }); } }; @@ -305,6 +295,14 @@ struct ValidateArrayImpl { return Status::OK(); } + Status Visit(const StringViewType& type) { + 
RETURN_NOT_OK(ValidateBinaryView(type)); + if (full_validation) { + RETURN_NOT_OK(ValidateUTF8(data)); + } + return Status::OK(); + } + Status Visit(const Date64Type& type) { RETURN_NOT_OK(ValidateFixedWidthBuffers()); diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index c18dfa09522..b93c4a6c847 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -302,6 +302,9 @@ void ComputeDataPreallocate(const DataType& type, case Type::LARGE_LIST: widths->emplace_back(64, /*added_length=*/1); return; + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + widths->emplace_back(8 * sizeof(StringHeader)); default: break; } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc index 8cf5a04addb..5a671cd05c8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc @@ -170,12 +170,6 @@ Status CastFromNull(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) return Status::OK(); } -Result ResolveOutputFromOptions(KernelContext* ctx, - const std::vector&) { - const CastOptions& options = checked_cast(*ctx->state()).options; - return options.to_type; -} - /// You will see some of kernels with /// /// kOutputTargetType @@ -184,8 +178,10 @@ Result ResolveOutputFromOptions(KernelContext* ctx, /// easiest initial way to get the requested cast type including the TimeUnit /// to the kernel (which is needed to compute the output) was through /// CastOptions - -OutputType kOutputTargetType(ResolveOutputFromOptions); +OutputType kOutputTargetType([](KernelContext* ctx, + const std::vector&) -> Result { + return CastState::Get(ctx).to_type; +}); Status ZeroCopyCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // TODO(wesm): alternative strategy for zero copy casts after ARROW-16576 diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h 
b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h index 0a57e3381d3..c32a26cc948 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h @@ -71,9 +71,6 @@ void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_ty CastFunction* func); // OutputType::Resolver that returns a type the type from CastOptions -Result ResolveOutputFromOptions(KernelContext* ctx, - const std::vector& args); - ARROW_EXPORT extern OutputType kOutputTargetType; // Add generic casts to out_ty from: diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index a02f83351b3..c9bbcc94a2e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -726,7 +726,7 @@ std::shared_ptr GetCastToFloating(std::string name) { } std::shared_ptr GetCastToDecimal128() { - OutputType sig_out_ty(ResolveOutputFromOptions); + OutputType sig_out_ty = kOutputTargetType; auto func = std::make_shared("cast_decimal", Type::DECIMAL128); AddCommonCasts(Type::DECIMAL128, sig_out_ty, func.get()); @@ -761,7 +761,7 @@ std::shared_ptr GetCastToDecimal128() { } std::shared_ptr GetCastToDecimal256() { - OutputType sig_out_ty(ResolveOutputFromOptions); + OutputType sig_out_ty = kOutputTargetType; auto func = std::make_shared("cast_decimal256", Type::DECIMAL256); AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index ebeb597207a..ff51b7f8c9b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -25,8 +25,10 @@ #include "arrow/compute/kernels/scalar_cast_internal.h" #include "arrow/compute/kernels/temporal_internal.h" #include "arrow/result.h" +#include "arrow/util/cpu_info.h" #include 
"arrow/util/formatting.h" #include "arrow/util/int_util.h" +#include "arrow/util/unreachable.h" #include "arrow/util/utf8_internal.h" #include "arrow/visit_data_inline.h" @@ -284,107 +286,192 @@ Status CastBinaryToBinaryOffsets(KernelContext* ctx, } template -enable_if_base_binary BinaryToBinaryCastExec(KernelContext* ctx, - const ExecSpan& batch, - ExecResult* out) { - const CastOptions& options = checked_cast(*ctx->state()).options; +Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { const ArraySpan& input = batch[0].array; - if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) { + // This presupposes that one was created in the invocation layer + ArrayData* output = out->array_data().get(); + output->SetNullCount(input.null_count); + + const auto& options = CastState::Get(ctx); + bool check_utf8 = !I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8; + if (check_utf8) { InitializeUTF8(); - ArraySpanVisitor visitor; - Utf8Validator validator; - RETURN_NOT_OK(visitor.Visit(input, &validator)); } - // Start with a zero-copy cast, but change indices to expected size - RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out)); - return CastBinaryToBinaryOffsets( - ctx, input, out->array_data().get()); -} + auto SimpleUtf8Validation = [&] { + if (check_utf8) { + Utf8Validator validator; + return ArraySpanVisitor::Visit(input, &validator); + } + return Status::OK(); + }; -template -enable_if_t::value && - !std::is_same::value, - Status> -BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const CastOptions& options = checked_cast(*ctx->state()).options; - const ArraySpan& input = batch[0].array; + constexpr bool kInputOffsets = + std::is_base_of_v || std::is_base_of_v; - if (O::is_utf8 && !options.allow_invalid_utf8) { - InitializeUTF8(); - ArraySpanVisitor visitor; - Utf8Validator validator; - RETURN_NOT_OK(visitor.Visit(input, &validator)); + constexpr bool kInputViews = 
std::is_base_of_v; + + constexpr bool kInputFixed = std::is_same_v; + + constexpr bool kOutputOffsets = + std::is_base_of_v || std::is_base_of_v; + + constexpr bool kOutputViews = std::is_base_of_v; + + constexpr bool kOutputFixed = std::is_same_v; + + if constexpr (kInputOffsets && kOutputOffsets) { + // FIXME(bkietz) this discards preallocated storage. It seems preferable to me to + // allocate a new null bitmap if necessary than to always allocate new offsets. + // Start with a zero-copy cast, but change indices to expected size + RETURN_NOT_OK(SimpleUtf8Validation()); + RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out)); + return CastBinaryToBinaryOffsets( + ctx, input, out->array_data().get()); } - // Check for overflow - using output_offset_type = typename O::offset_type; - constexpr output_offset_type kMaxOffset = - std::numeric_limits::max(); - const int32_t width = input.type->byte_width(); - const int64_t max_offset = width * input.length; - if (max_offset > kMaxOffset) { - return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", - out->type()->ToString(), ": input array too large"); + if constexpr (kInputViews && kOutputViews) { + return SimpleUtf8Validation() & ZeroCopyCastExec(ctx, batch, out); } - // This presupposes that one was created in the invocation layer - ArrayData* output = out->array_data().get(); + if constexpr (kInputViews && kOutputOffsets) { + // FIXME(bkietz) this discards preallocated offset storage + typename TypeTraits::BuilderType builder{ctx->memory_pool()}; - // Copy buffers over, then generate indices - output->length = input.length; - output->SetNullCount(input.null_count); - if (input.offset == output->offset) { - output->buffers[0] = input.GetBuffer(0); - } else { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, - input.offset, input.length)); - } + RETURN_NOT_OK(builder.Reserve(input.length)); + // TODO(bkietz) if ArraySpan::buffers 
were a SmallVector, we could have access to all + // the character data buffers here and reserve character data accordingly. + + // sweep through L1-sized chunks to reduce the frequency of allocation + int64_t chunk_size = ctx->exec_context()->cpu_info()->CacheSize( + ::arrow::internal::CpuInfo::CacheLevel::L1) / + sizeof(StringHeader) / 4; + + RETURN_NOT_OK(::arrow::internal::VisitSlices( + input, chunk_size, [&](const ArraySpan& input_slice) { + int64_t num_chars = builder.value_data_length(), num_appended_chars = 0; + VisitArraySpanInline( + input_slice, + [&](std::string_view v) { + num_appended_chars += static_cast(v.size()); + }, + [] {}); + + RETURN_NOT_OK(builder.ReserveData(num_appended_chars)); + + VisitArraySpanInline( + input_slice, [&](std::string_view v) { builder.UnsafeAppend(v); }, + [&] { builder.UnsafeAppendNull(); }); + + if (check_utf8) { + if (ARROW_PREDICT_FALSE(!ValidateUTF8Inline(builder.value_data() + num_chars, + num_appended_chars))) { + return Status::Invalid("Invalid UTF8 sequence"); + } + } + return Status::OK(); + })); - // This buffer is preallocated - output_offset_type* offsets = output->GetMutableValues(1); - offsets[0] = static_cast(input.offset * width); - for (int64_t i = 0; i < input.length; i++) { - offsets[i + 1] = offsets[i] + width; + return builder.FinishInternal(std::get_if>(&out->value)); } - // Data buffer (index 1) for FWBinary becomes data buffer for VarBinary - // (index 2). After ARROW-16757, we need to copy this memory instead of - // zero-copy it because a Scalar value promoted to an ArraySpan may be - // referencing a temporary buffer whose scope does not extend beyond the - // kernel execution. In that scenario, the validity bitmap above can be - // zero-copied because it points to static memory (either a byte with a 1 or - // a 0 depending on whether the value is null or not). 
- std::shared_ptr input_data = input.GetBuffer(1); - if (input_data != nullptr) { - ARROW_ASSIGN_OR_RAISE(output->buffers[2], input_data->CopySlice(0, input_data->size(), - ctx->memory_pool())); - } else { - // TODO(wesm): it should already be nullptr, so we may be able to remove - // this - output->buffers[2] = nullptr; + if constexpr ((kInputOffsets || kInputFixed) && kOutputViews) { + // we can reuse the data buffer here and just add views which reference it + if (input.MayHaveNulls()) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } + // FIXME(bkietz) segfault due to null buffer owner + // output->buffers[2] = input.GetBuffer(kInputFixed ? 1 : 2); + + auto* headers = output->buffers[1]->mutable_data_as(); + if (check_utf8) { + Utf8Validator validator; + return VisitArraySpanInline( + input, + [&](std::string_view v) { + *headers++ = StringHeader{v}; + return validator.VisitValue(v); + }, + [&] { + *headers++ = StringHeader{}; + return Status::OK(); + }); + } else { + VisitArraySpanInline( + input, [&](std::string_view v) { *headers++ = StringHeader{v}; }, + [&] { *headers++ = StringHeader{}; }); + return Status::OK(); + } } - return Status::OK(); -} + if constexpr (kInputFixed && kOutputOffsets) { + RETURN_NOT_OK(SimpleUtf8Validation()); -template -enable_if_t::value && - std::is_same::value, - Status> -BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const CastOptions& options = checked_cast(*ctx->state()).options; - const int32_t in_width = batch[0].type()->byte_width(); - const int32_t out_width = - checked_cast(*options.to_type).byte_width(); - if (in_width != out_width) { - return Status::Invalid("Failed casting from ", batch[0].type()->ToString(), " to ", - options.to_type.ToString(), ": widths must match"); + using output_offset_type = typename O::offset_type; + + int32_t width = input.type->byte_width(); + + 
if constexpr (std::is_same_v) { + // Check for overflow + if (width * input.length > std::numeric_limits::max()) { + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + out->type()->ToString(), ": input array too large"); + } + } + + // Copy buffers over, then generate indices + output->length = input.length; + output->SetNullCount(input.null_count); + if (input.offset == output->offset) { + output->buffers[0] = input.GetBuffer(0); + } else { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } + + // This buffer is preallocated + auto* offsets = output->buffers[1]->mutable_data_as(); + offsets[0] = static_cast(input.offset * width); + for (int64_t i = 0; i < input.length; i++) { + offsets[i + 1] = offsets[i] + width; + } + + // Data buffer (index 1) for FWBinary becomes data buffer for VarBinary + // (index 2). After ARROW-16757, we need to copy this memory instead of + // zero-copy it because a Scalar value promoted to an ArraySpan may be + // referencing a temporary buffer whose scope does not extend beyond the + // kernel execution. In that scenario, the validity bitmap above can be + // zero-copied because it points to static memory (either a byte with a 1 or + // a 0 depending on whether the value is null or not). 
+ if (std::shared_ptr input_data = input.GetBuffer(1)) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[2], + input_data->CopySlice(0, input_data->size(), ctx->memory_pool())); + } else { + // TODO(wesm): it should already be nullptr, so we may be able to remove + // this + output->buffers[2] = nullptr; + } + + return Status::OK(); } - return ZeroCopyCastExec(ctx, batch, out); + + if constexpr (kInputFixed && kOutputFixed) { + if (input.type->byte_width() != output->type->byte_width()) { + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + output->type->ToString(), ": widths must match"); + } + return ZeroCopyCastExec(ctx, batch, out); + } + + Unreachable(); } #if defined(_MSC_VER) @@ -447,6 +534,8 @@ template void AddBinaryToBinaryCast(CastFunction* func) { AddBinaryToBinaryCast(func); AddBinaryToBinaryCast(func); + AddBinaryToBinaryCast(func); + AddBinaryToBinaryCast(func); AddBinaryToBinaryCast(func); AddBinaryToBinaryCast(func); AddBinaryToBinaryCast(func); @@ -459,6 +548,11 @@ std::vector> GetBinaryLikeCasts() { AddCommonCasts(Type::BINARY, binary(), cast_binary.get()); AddBinaryToBinaryCast(cast_binary.get()); + auto cast_binary_view = + std::make_shared("cast_binary_view", Type::BINARY_VIEW); + AddCommonCasts(Type::BINARY_VIEW, binary_view(), cast_binary_view.get()); + AddBinaryToBinaryCast(cast_binary_view.get()); + auto cast_large_binary = std::make_shared("cast_large_binary", Type::LARGE_BINARY); AddCommonCasts(Type::LARGE_BINARY, large_binary(), cast_large_binary.get()); @@ -471,6 +565,14 @@ std::vector> GetBinaryLikeCasts() { AddTemporalToStringCasts(cast_string.get()); AddBinaryToBinaryCast(cast_string.get()); + auto cast_string_view = + std::make_shared("cast_string_view", Type::STRING_VIEW); + AddCommonCasts(Type::STRING_VIEW, utf8_view(), cast_string_view.get()); + AddNumberToStringCasts(cast_string_view.get()); + AddDecimalToStringCasts(cast_string_view.get()); + AddTemporalToStringCasts(cast_string_view.get()); + 
AddBinaryToBinaryCast(cast_string_view.get()); + auto cast_large_string = std::make_shared("cast_large_string", Type::LARGE_STRING); AddCommonCasts(Type::LARGE_STRING, large_utf8(), cast_large_string.get()); @@ -481,15 +583,16 @@ std::vector> GetBinaryLikeCasts() { auto cast_fsb = std::make_shared("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY); - AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions), - cast_fsb.get()); + AddCommonCasts(Type::FIXED_SIZE_BINARY, kOutputTargetType, cast_fsb.get()); DCHECK_OK(cast_fsb->AddKernel( - Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, - OutputType(FirstType), + Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, kOutputTargetType, BinaryToBinaryCastExec, NullHandling::COMPUTED_NO_PREALLOCATE)); - return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb}; + return { + cast_binary, cast_binary_view, cast_large_binary, cast_string, + cast_string_view, cast_large_string, cast_fsb, + }; } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 083a85eb346..5384008b1b7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -145,7 +145,7 @@ static std::shared_ptr MaskArrayWithNullsAt(std::shared_ptr input, using arrow::internal::Bitmap; Bitmap is_valid(masked->buffers[0], 0, input->length()); - if (auto original = input->null_bitmap()) { + if (const auto& original = input->null_bitmap()) { is_valid.CopyFrom(Bitmap(original, input->offset(), input->length())); } else { is_valid.SetBitsTo(true); @@ -154,7 +154,7 @@ static std::shared_ptr MaskArrayWithNullsAt(std::shared_ptr input, for (int i : indices_to_mask) { is_valid.SetBitTo(i, false); } - return MakeArray(masked); + return MakeArray(std::move(masked)); } TEST(Cast, CanCast) { @@ -167,6 +167,9 @@ TEST(Cast, CanCast) { } }; + ExpectCanCast(boolean(), 
{utf8()}); + return; + auto ExpectCannotCast = [ExpectCanCast](std::shared_ptr from, std::vector> to_set) { ExpectCanCast(from, to_set, /*expected=*/false); @@ -198,17 +201,21 @@ TEST(Cast, CanCast) { ExpectCannotCast(from_numeric, {null()}); } - for (auto from_base_binary : kBaseBinaryTypes) { - ExpectCanCast(from_base_binary, {boolean()}); - ExpectCanCast(from_base_binary, kNumericTypes); - ExpectCanCast(from_base_binary, kBaseBinaryTypes); - ExpectCanCast(dictionary(int64(), from_base_binary), {from_base_binary}); + auto base_binary_and_view_types = kBaseBinaryTypes; + base_binary_and_view_types.push_back(binary_view()); + base_binary_and_view_types.push_back(utf8_view()); + + for (auto from : base_binary_and_view_types) { + ExpectCanCast(from, {boolean()}); + ExpectCanCast(from, kNumericTypes); + ExpectCanCast(from, base_binary_and_view_types); + ExpectCanCast(dictionary(int64(), from), {from}); // any cast which is valid for the dictionary is valid for the DictionaryArray - ExpectCanCast(dictionary(uint32(), from_base_binary), kBaseBinaryTypes); - ExpectCanCast(dictionary(int16(), from_base_binary), kNumericTypes); + ExpectCanCast(dictionary(uint32(), from), kBaseBinaryTypes); + ExpectCanCast(dictionary(int16(), from), kNumericTypes); - ExpectCannotCast(from_base_binary, {null()}); + ExpectCannotCast(from, {null()}); } ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI)}); @@ -1029,7 +1036,7 @@ TEST(Cast, DecimalToFloating) { } TEST(Cast, DecimalToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { for (auto decimal_type : {decimal128(5, 2), decimal256(5, 2)}) { CheckCast(ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "999.99"])"), ArrayFromJSON(string_type, R"(["0.00", null, "123.45", "999.99"])")); @@ -1548,7 +1555,7 @@ TEST(Cast, TimeZeroCopy) { } TEST(Cast, DateToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), 
utf8_view()}) { CheckCast(ArrayFromJSON(date32(), "[0, null]"), ArrayFromJSON(string_type, R"(["1970-01-01", null])")); CheckCast(ArrayFromJSON(date64(), "[86400000, null]"), @@ -1557,7 +1564,7 @@ TEST(Cast, DateToString) { } TEST(Cast, TimeToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(time32(TimeUnit::SECOND), "[1, 62]"), ArrayFromJSON(string_type, R"(["00:00:01", "00:01:02"])")); CheckCast( @@ -1567,7 +1574,7 @@ TEST(Cast, TimeToString) { } TEST(Cast, TimestampToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND), "[-30610224000, -5364662400]"), ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])")); @@ -1593,7 +1600,7 @@ TEST(Cast, TimestampToString) { } TEST_F(CastTimezone, TimestampWithZoneToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast( ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"), ArrayFromJSON(string_type, @@ -1779,7 +1786,7 @@ TEST(Cast, DurationToDurationMultiplyOverflow) { } TEST(Cast, DurationToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { for (auto unit : TimeUnit::values()) { CheckCast(ArrayFromJSON(duration(unit), "[0, null, 1234567, 2000]"), ArrayFromJSON(string_type, R"(["0", null, "1234567", "2000"])")); @@ -2016,6 +2023,10 @@ TEST(Cast, StringToTimestamp) { } static void AssertBinaryZeroCopy(std::shared_ptr lhs, std::shared_ptr rhs) { + for (auto id : {lhs->type_id(), rhs->type_id()}) { + // views cannot be zero copied + if (id == Type::BINARY_VIEW || id == Type::STRING_VIEW) return; + } // null bitmap and data buffers are always zero-copied AssertBufferSame(*lhs, *rhs, 0); 
AssertBufferSame(*lhs, *rhs, 2); @@ -2039,8 +2050,9 @@ static void AssertBinaryZeroCopy(std::shared_ptr lhs, std::shared_ptr empty always works CheckCast(ArrayFromJSON(bin_type, "[]"), ArrayFromJSON(string_type, "[]")); @@ -2058,13 +2070,14 @@ TEST(Cast, BinaryToString) { options.allow_invalid_utf8 = true; ASSERT_OK_AND_ASSIGN(auto strings, Cast(*invalid_utf8, string_type, options)); ASSERT_RAISES(Invalid, strings->ValidateFull()); + AssertBinaryZeroCopy(invalid_utf8, strings); } } auto from_type = fixed_size_binary(3); auto invalid_utf8 = FixedSizeInvalidUtf8(from_type); - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(from_type, "[]"), ArrayFromJSON(string_type, "[]")); // invalid utf-8 masked by a null bit is not an error @@ -2083,9 +2096,12 @@ TEST(Cast, BinaryToString) { // N.B. null buffer is not always the same if input sliced AssertBufferSame(*invalid_utf8, *strings, 0); - // ARROW-16757: we no longer zero copy, but the contents are equal - ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get()); - ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2])); + if (string_type->id() != Type::STRING_VIEW) { + // ARROW-16757: we no longer zero copy, but the contents are equal + ASSERT_NE(invalid_utf8->data()->buffers[1].get(), + strings->data()->buffers[2].get()); + ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2])); + } } } @@ -2154,7 +2170,7 @@ TEST(Cast, StringToString) { } TEST(Cast, IntToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), ArrayFromJSON(string_type, R"(["0", "1", "127", "-128", null])")); @@ -2186,7 +2202,7 @@ TEST(Cast, IntToString) { } TEST(Cast, FloatingToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto 
string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast( ArrayFromJSON(float32(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); @@ -2198,7 +2214,7 @@ TEST(Cast, FloatingToString) { } TEST(Cast, BooleanToString) { - for (auto string_type : {utf8(), large_utf8()}) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { CheckCast(ArrayFromJSON(boolean(), "[true, true, false, null]"), ArrayFromJSON(string_type, R"(["true", "true", "false", null])")); } From 6ed4ac0f14da92377992634e3d4961daa3fe8fa4 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 30 Nov 2022 17:01:23 -0500 Subject: [PATCH 16/38] Adding IPC serde of views by converting to/from dense strings --- cpp/src/arrow/buffer_builder.h | 6 ++ .../compute/kernels/scalar_cast_string.cc | 68 +++++++++++----- cpp/src/arrow/ipc/metadata_internal.cc | 10 ++- cpp/src/arrow/ipc/metadata_internal.h | 2 + cpp/src/arrow/ipc/read_write_test.cc | 79 ++++++++++--------- cpp/src/arrow/ipc/reader.cc | 56 +++++++++++-- cpp/src/arrow/ipc/test_common.cc | 48 +++++------ cpp/src/arrow/ipc/writer.cc | 53 ++++++++++++- cpp/src/arrow/util/string_header.h | 5 +- 9 files changed, 230 insertions(+), 97 deletions(-) diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index 5f37e552004..ab397485d60 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -117,6 +117,9 @@ class ARROW_EXPORT BufferBuilder { UnsafeAppend(data, length); return Status::OK(); } + Status Append(std::string_view v) { + return Append(v.data(), static_cast(v.size())); + } /// \brief Append copies of a value to the buffer /// @@ -138,6 +141,9 @@ class ARROW_EXPORT BufferBuilder { memcpy(data_ + size_, data, static_cast(length)); size_ += length; } + void UnsafeAppend(std::string_view v) { + UnsafeAppend(v.data(), static_cast(v.size())); + } void UnsafeAppend(const int64_t num_copies, uint8_t value) { memset(data_ 
+ size_, value, static_cast(num_copies)); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index ff51b7f8c9b..5c4f69bee32 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -323,11 +323,11 @@ Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, constexpr bool kOutputFixed = std::is_same_v; if constexpr (kInputOffsets && kOutputOffsets) { - // FIXME(bkietz) this discards preallocated storage. It seems preferable to me to - // allocate a new null bitmap if necessary than to always allocate new offsets. // Start with a zero-copy cast, but change indices to expected size RETURN_NOT_OK(SimpleUtf8Validation()); RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out)); + // FIXME(bkietz) this discards preallocated storage. It seems preferable to me to + // allocate a new null bitmap if necessary than to always allocate new offsets. return CastBinaryToBinaryOffsets( ctx, input, out->array_data().get()); } @@ -337,54 +337,86 @@ Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, } if constexpr (kInputViews && kOutputOffsets) { - // FIXME(bkietz) this discards preallocated offset storage - typename TypeTraits::BuilderType builder{ctx->memory_pool()}; + // copy the input's null bitmap if necessary + if (input.MayHaveNulls()) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } else { + output->buffers[0] = nullptr; + } + + using offset_type = typename O::offset_type; + + auto* offsets = output->buffers[1]->mutable_data_as(); + offsets[0] = 0; + auto AppendOffset = [&](size_t size) mutable { + offsets[1] = offsets[0] + static_cast(size); + ++offsets; + }; - RETURN_NOT_OK(builder.Reserve(input.length)); // TODO(bkietz) if ArraySpan::buffers were a SmallVector, we could have access to all // the 
character data buffers here and reserve character data accordingly. + BufferBuilder char_builder{ctx->memory_pool()}; - // sweep through L1-sized chunks to reduce the frequency of allocation + // sweep through L1-sized chunks to reduce the frequency of allocation and overflow + // checking int64_t chunk_size = ctx->exec_context()->cpu_info()->CacheSize( ::arrow::internal::CpuInfo::CacheLevel::L1) / sizeof(StringHeader) / 4; RETURN_NOT_OK(::arrow::internal::VisitSlices( input, chunk_size, [&](const ArraySpan& input_slice) { - int64_t num_chars = builder.value_data_length(), num_appended_chars = 0; + size_t num_appended_chars = 0; + int64_t num_chars = char_builder.length(); VisitArraySpanInline( - input_slice, - [&](std::string_view v) { - num_appended_chars += static_cast(v.size()); - }, + input_slice, [&](std::string_view v) { num_appended_chars += v.size(); }, [] {}); - RETURN_NOT_OK(builder.ReserveData(num_appended_chars)); + if constexpr (std::is_same_v) { + if (ARROW_PREDICT_FALSE(char_builder.length() + num_appended_chars > + std::numeric_limits::max())) { + return Status::Invalid("Failed casting from ", input.type->ToString(), + " to ", out->type()->ToString(), + ": input array viewed too many characters"); + } + } + RETURN_NOT_OK(char_builder.Reserve(static_cast(num_appended_chars))); VisitArraySpanInline( - input_slice, [&](std::string_view v) { builder.UnsafeAppend(v); }, - [&] { builder.UnsafeAppendNull(); }); + input_slice, + [&](std::string_view v) { + char_builder.UnsafeAppend(v); + AppendOffset(v.size()); + }, + [&] { AppendOffset(0); }); if (check_utf8) { - if (ARROW_PREDICT_FALSE(!ValidateUTF8Inline(builder.value_data() + num_chars, - num_appended_chars))) { + if (ARROW_PREDICT_FALSE( + !ValidateUTF8Inline(char_builder.data() + num_chars, + static_cast(num_appended_chars)))) { return Status::Invalid("Invalid UTF8 sequence"); } } return Status::OK(); })); - return builder.FinishInternal(std::get_if>(&out->value)); + return 
char_builder.Finish(&output->buffers[2]); } if constexpr ((kInputOffsets || kInputFixed) && kOutputViews) { - // we can reuse the data buffer here and just add views which reference it + // FIXME(bkietz) when outputting views, we *could* output into slices, + // provided we have a threadsafe place to stash accumulated buffers + // of character data. + if (input.MayHaveNulls()) { ARROW_ASSIGN_OR_RAISE( output->buffers[0], arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, input.offset, input.length)); } + // FIXME(bkietz) segfault due to null buffer owner // output->buffers[2] = input.GetBuffer(kInputFixed ? 1 : 2); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 255bff2241d..46b163b829f 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -536,11 +536,13 @@ class FieldToFlatbufferVisitor { Status Visit(const BinaryViewType& type) { // BinaryView will be written to IPC as a normal binary array + extra_type_metadata_[std::string{kSerializedStringViewKeyName}] = ""; return Visit(BinaryType()); } Status Visit(const StringViewType& type) { // StringView will be written to IPC as a normal UTF8 string array + extra_type_metadata_[std::string{kSerializedStringViewKeyName}] = ""; return Visit(StringType()); } @@ -830,7 +832,7 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, FieldPosition field_pos, dictionary_id = encoding->id(); } - // 4. Is it an extension type? + // 4. Is it an extension or view type? if (metadata != nullptr) { // Look for extension metadata in custom_metadata field int name_index = metadata->FindKey(kExtensionTypeKeyName); @@ -851,6 +853,12 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, FieldPosition field_pos, } // NOTE: if extension type is unknown, we do not raise here and // simply return the storage type. 
+ } else if (name_index = metadata->FindKey(std::string{kSerializedStringViewKeyName}); + name_index != -1) { + DCHECK(type->id() == Type::STRING || type->id() == Type::BINARY); + RETURN_NOT_OK(metadata->Delete(name_index)); + bool is_utf8 = type->id() == Type::STRING; + type = is_utf8 ? utf8_view() : binary_view(); } } diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index abbed5b2dac..6f07a8aea4f 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -245,6 +245,8 @@ flatbuf::TimeUnit ToFlatbufferUnit(TimeUnit::type unit); ARROW_EXPORT TimeUnit::type FromFlatbufferUnit(flatbuf::TimeUnit unit); +constexpr std::string_view kSerializedStringViewKeyName = "ARROW:string_view"; + } // namespace internal } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 7de81eff7a7..8d4db5ea0e7 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -240,23 +240,33 @@ class TestSchemaMetadata : public ::testing::Test { } }; -const std::shared_ptr INT32 = std::make_shared(); - TEST_F(TestSchemaMetadata, PrimitiveFields) { - auto f0 = field("f0", std::make_shared()); - auto f1 = field("f1", std::make_shared(), false); - auto f2 = field("f2", std::make_shared()); - auto f3 = field("f3", std::make_shared()); - auto f4 = field("f4", std::make_shared()); - auto f5 = field("f5", std::make_shared()); - auto f6 = field("f6", std::make_shared()); - auto f7 = field("f7", std::make_shared()); - auto f8 = field("f8", std::make_shared()); - auto f9 = field("f9", std::make_shared(), false); - auto f10 = field("f10", std::make_shared()); - - Schema schema({f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", int8()), + field("f1", int16(), false), + field("f2", int32()), + field("f3", int64()), + field("f4", uint8()), + 
field("f5", uint16()), + field("f6", uint32()), + field("f7", uint64()), + field("f8", float32()), + field("f9", float64(), false), + field("f10", boolean()), + })); +} + +TEST_F(TestSchemaMetadata, BinaryFields) { + CheckSchemaRoundtrip(Schema({ + field("f0", utf8()), + field("f1", binary()), + field("f2", large_utf8()), + field("f3", large_binary()), + field("f4", utf8_view()), + field("f5", binary_view()), + field("f6", fixed_size_binary(3)), + field("f7", fixed_size_binary(33)), + })); } TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { @@ -269,15 +279,14 @@ TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { } TEST_F(TestSchemaMetadata, NestedFields) { - auto type = list(int32()); - auto f0 = field("f0", type); - - std::shared_ptr type2( - new StructType({field("k1", INT32), field("k2", INT32), field("k3", INT32)})); - auto f1 = field("f1", type2); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", list(int32())), + field("f1", struct_({ + field("k1", int32()), + field("k2", int32()), + field("k3", int32()), + })), + })); } // Verify that nullable=false is well-preserved for child fields of map type. 
@@ -305,19 +314,15 @@ TEST_F(TestSchemaMetadata, NestedFieldsWithKeyValueMetadata) { TEST_F(TestSchemaMetadata, DictionaryFields) { { - auto dict_type = dictionary(int8(), int32(), true /* ordered */); - auto f0 = field("f0", dict_type); - auto f1 = field("f1", list(dict_type)); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + auto dict_type = dictionary(int8(), int32(), /*ordered=*/true); + CheckSchemaRoundtrip(Schema({ + field("f0", dict_type), + field("f1", list(dict_type)), + })); } { auto dict_type = dictionary(int8(), list(int32())); - auto f0 = field("f0", dict_type); - - Schema schema({f0}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } } @@ -325,9 +330,7 @@ TEST_F(TestSchemaMetadata, NestedDictionaryFields) { { auto inner_dict_type = dictionary(int8(), int32(), /*ordered=*/true); auto dict_type = dictionary(int16(), list(inner_dict_type)); - - Schema schema({field("f0", dict_type)}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } { auto dict_type1 = dictionary(int8(), utf8(), /*ordered=*/true); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 4577a416523..aa57d6c70de 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -58,6 +58,7 @@ #include "arrow/util/thread_pool.h" #include "arrow/util/ubsan.h" #include "arrow/util/vector.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" #include "generated/File_generated.h" // IWYU pragma: export @@ -289,7 +290,6 @@ class ArrayLoader { return Status::OK(); } - template Status LoadBinary(Type::type type_id) { out_->buffers.resize(3); @@ -345,12 +345,13 @@ class ArrayLoader { template enable_if_base_binary Visit(const T& type) { - return LoadBinary(type.id()); + return LoadBinary(type.id()); } Status Visit(const BinaryViewType& type) { - DCHECK(false); - return Status::NotImplemented("Reading IPC format to binary view is not supported"); 
+ // View arrays are serialized as the corresponding dense array. + // We can't produce the view array yet; the buffers may still be compressed. + return LoadBinary(type.id() == Type::STRING_VIEW ? Type::STRING : Type::BINARY); } Status Visit(const FixedSizeBinaryType& type) { @@ -524,6 +525,49 @@ Status DecompressBuffers(Compression::type compression, const IpcReadOptions& op }); } +Status ConvertViewArrays(const IpcReadOptions& options, ArrayDataVector* fields) { + struct StringViewAccumulator { + using DataPtrVector = std::vector; + + void AppendFrom(const ArrayDataVector& fields) { + for (const auto& field : fields) { + if (field->type->id() == Type::STRING_VIEW || + field->type->id() == Type::BINARY_VIEW) { + view_arrays_.push_back(field.get()); + } + AppendFrom(field->child_data); + } + } + + DataPtrVector Get(const ArrayDataVector& fields) && { + AppendFrom(fields); + return std::move(view_arrays_); + } + + DataPtrVector view_arrays_; + }; + + auto view_arrays = StringViewAccumulator{}.Get(*fields); + + return ::arrow::internal::OptionalParallelFor( + options.use_threads, static_cast(view_arrays.size()), [&](int i) { + ArrayData* data = view_arrays[i]; + + // the only thing we need to fix here is replacing offsets with headers + ARROW_ASSIGN_OR_RAISE( + auto header_buffer, + AllocateBuffer(data->length * sizeof(StringHeader), options.memory_pool)); + + auto* headers = header_buffer->mutable_data_as(); + VisitArraySpanInline( + *data, [&](std::string_view v) { *headers++ = StringHeader{v}; }, + [&] { *headers++ = StringHeader{}; }); + + data->buffers[1] = std::move(header_buffer); + return Status::OK(); + }); +} + Result> LoadRecordBatchSubset( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, const std::vector* inclusion_mask, const IpcReadContext& context, @@ -572,6 +616,7 @@ Result> LoadRecordBatchSubset( RETURN_NOT_OK( DecompressBuffers(context.compression, context.options, &filtered_columns)); } + 
RETURN_NOT_OK(ConvertViewArrays(context.options, &filtered_columns)); // swap endian in a set of ArrayData if necessary (swap_endian == true) if (context.swap_endian) { @@ -821,10 +866,11 @@ Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context, const Field dummy_field("", value_type); RETURN_NOT_OK(loader.Load(&dummy_field, dict_data.get())); + ArrayDataVector dict_fields{dict_data}; if (compression != Compression::UNCOMPRESSED) { - ArrayDataVector dict_fields{dict_data}; RETURN_NOT_OK(DecompressBuffers(compression, context.options, &dict_fields)); } + RETURN_NOT_OK(ConvertViewArrays(context.options, &dict_fields)); // swap endian in dict_data if necessary (swap_endian == true) if (context.swap_endian) { diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 53721c0b20f..37fb3ed3f84 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -353,37 +353,27 @@ static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls) { const int64_t length = 500; - auto f0 = field("strings", utf8()); - auto f1 = field("binaries", binary()); - auto f2 = field("large_strings", large_utf8()); - auto f3 = field("large_binaries", large_binary()); - auto schema = ::arrow::schema({f0, f1, f2, f3}); - - std::shared_ptr a0, a1, a2, a3; - MemoryPool* pool = default_memory_pool(); - // Quirk with RETURN_NOT_OK macro and templated functions - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a0); - RETURN_NOT_OK(s); + ArrayVector arrays; + FieldVector fields; + + using namespace std::string_literals; + for (auto MakeArray : { + &MakeBinaryArrayWithUniqueValues, + &MakeBinaryArrayWithUniqueValues, + &MakeBinaryArrayWithUniqueValues, + &MakeBinaryArrayWithUniqueValues, + &MakeBinaryArrayWithUniqueValues, + &MakeBinaryArrayWithUniqueValues, + }) { + arrays.emplace_back(); + 
RETURN_NOT_OK(MakeArray(length, with_nulls, default_memory_pool(), &arrays.back())); + + const auto& type = arrays.back()->type(); + fields.push_back(field(type->ToString(), type)); } - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a1); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a2); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a3); - RETURN_NOT_OK(s); - } - *out = RecordBatch::Make(schema, length, {a0, a1, a2, a3}); + + *out = RecordBatch::Make(schema(std::move(fields)), length, std::move(arrays)); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 1b7fb74cb9d..e911abffa34 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -52,10 +52,12 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" #include "arrow/util/endian.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/parallel.h" #include "arrow/visit_array_inline.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -398,6 +400,53 @@ class RecordBatchSerializer { return Status::OK(); } + Status Visit(const BinaryViewArray& array) { + // a separate helper doesn't make sense here since we've already done the work + // to copy the bitmap + out_->body_buffers.emplace_back(); + ARROW_ASSIGN_OR_RAISE( + out_->body_buffers.back(), + AllocateBuffer(sizeof(int32_t) * (array.length() + 1), options_.memory_pool)); + + auto* offsets = out_->body_buffers.back()->mutable_data_as(); + offsets[0] = 0; + auto AppendOffset = [&](size_t size) mutable { + // ignore overflow for now + offsets[1] = arrow::internal::SafeSignedAdd(offsets[0], static_cast(size)); + ++offsets; + }; + + int64_t size = 0; + VisitArraySpanInline( + *array.data(), + [&](std::string_view v) { + 
size += static_cast(v.size()); + AppendOffset(v.size()); + }, + [&] { AppendOffset(0); }); + + if (size > std::numeric_limits::max()) { + return Status::Invalid( + "Input view array viewed more characters than are representable with 32 bit " + "offsets, unable to serialize"); + } + + out_->body_buffers.emplace_back(); + ARROW_ASSIGN_OR_RAISE(out_->body_buffers.back(), + AllocateBuffer(size, options_.memory_pool)); + + VisitArraySpanInline( + *array.data(), + [chars = out_->body_buffers.back()->mutable_data_as()]( + std::string_view v) mutable { + v.copy(chars, v.size()); + chars += v.size(); + }, + [] {}); + + return Status::OK(); + } + template enable_if_base_list Visit(const T& array) { using offset_type = typename T::offset_type; @@ -425,10 +474,6 @@ class RecordBatchSerializer { return Status::OK(); } - Status Visit(const BinaryViewArray& array) { - return Status::NotImplemented("Binary / string view type"); - } - Status Visit(const FixedSizeListArray& array) { --max_recursion_depth_; auto size = array.list_type()->list_size(); diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h index e3e9d9d69cd..182b749f219 100644 --- a/cpp/src/arrow/util/string_header.h +++ b/cpp/src/arrow/util/string_header.h @@ -158,8 +158,9 @@ struct StringHeader { return memcmp(prefix_.data(), other.prefix_.data(), kPrefixSize); } int32_t size = std::min(size_, other.size_) - kPrefixSize; - if (size <= 0) { - // One ends within the prefix. + assert(size >= 0); + if (size == 0) { + // One string is just the prefix. 
return size_ - other.size_; } if (static_cast(size) <= kInlineSize && IsInline() && other.IsInline()) { From a8bf2586efb6af501e76ae7d9a17e4ae24cfebba Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 13 Apr 2023 12:45:01 -0400 Subject: [PATCH 17/38] initial attempt at indices/offsets repr in arrow --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/array_base.cc | 4 +- cpp/src/arrow/array/array_binary.h | 41 +- cpp/src/arrow/array/array_binary_test.cc | 190 +- cpp/src/arrow/array/array_view_test.cc | 32 + cpp/src/arrow/array/builder_binary.cc | 19 +- cpp/src/arrow/array/builder_binary.h | 106 +- cpp/src/arrow/array/concatenate.cc | 37 +- cpp/src/arrow/array/concatenate_test.cc | 15 + cpp/src/arrow/array/data.cc | 59 +- cpp/src/arrow/array/data.h | 4 + cpp/src/arrow/array/dict_internal.h | 54 + cpp/src/arrow/array/util.cc | 266 +- cpp/src/arrow/array/util.h | 21 + cpp/src/arrow/array/validate.cc | 177 +- cpp/src/arrow/builder.cc | 14 +- cpp/src/arrow/compare.cc | 34 +- .../compute/kernels/scalar_cast_string.cc | 98 +- cpp/src/arrow/ipc/feather_test.cc | 6 +- cpp/src/arrow/ipc/json_simple_test.cc | 2 + cpp/src/arrow/ipc/metadata_internal.cc | 52 +- cpp/src/arrow/ipc/metadata_internal.h | 8 +- cpp/src/arrow/ipc/read_write_test.cc | 12 +- cpp/src/arrow/ipc/reader.cc | 78 +- cpp/src/arrow/ipc/test_common.cc | 23 +- cpp/src/arrow/ipc/test_common.h | 3 +- cpp/src/arrow/ipc/writer.cc | 57 +- cpp/src/arrow/ipc/writer.h | 1 + cpp/src/arrow/testing/json_internal.cc | 204 +- cpp/src/arrow/testing/random.cc | 17 +- cpp/src/arrow/testing/random_test.cc | 1 + cpp/src/arrow/testing/util.h | 8 + cpp/src/arrow/type.cc | 22 +- cpp/src/arrow/type.h | 13 +- cpp/src/arrow/type_fwd.h | 4 +- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/range.h | 58 +- cpp/src/arrow/util/sort.h | 15 +- cpp/src/arrow/util/span.h | 115 + cpp/src/arrow/util/string.cc | 10 + cpp/src/arrow/util/string.h | 4 +- .../arrow/util/string_conversion_benchmark.cc | 250 ++ 
cpp/src/arrow/util/string_header.cc | 29 + cpp/src/arrow/util/string_header.h | 263 +- cpp/src/arrow/visit_data_inline.h | 65 +- cpp/src/generated/File_generated.h | 10 +- cpp/src/generated/Message_generated.h | 35 +- cpp/src/generated/Schema_generated.h | 147 +- cpp/src/generated/SparseTensor_generated.h | 18 + cpp/src/generated/Tensor_generated.h | 16 + cpp/src/generated/feather_generated.h | 7 + cpp/src/parquet/arrow/reader_internal.cc | 2 + cpp/src/parquet/arrow/schema.cc | 6 + cpp/src/parquet/column_writer.cc | 4 +- cpp/src/parquet/encoding.cc | 31 +- cpp/src/parquet/statistics.cc | 4 + cpp/src/plasma/common_generated.h | 230 + cpp/src/plasma/plasma_generated.h | 3984 +++++++++++++++++ dev/archery/archery/integration/datagen.py | 96 + docs/source/format/Columnar.rst | 42 +- format/Message.fbs | 6 + format/Schema.fbs | 14 + 62 files changed, 6483 insertions(+), 662 deletions(-) create mode 100644 cpp/src/arrow/util/span.h create mode 100644 cpp/src/arrow/util/string_conversion_benchmark.cc create mode 100644 cpp/src/arrow/util/string_header.cc create mode 100644 cpp/src/plasma/common_generated.h create mode 100644 cpp/src/plasma/plasma_generated.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 00cf899349a..bd093edaad8 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -226,6 +226,7 @@ set(ARROW_SRCS util/ree_util.cc util/string.cc util/string_builder.cc + util/string_header.cc util/task_group.cc util/tdigest.cc util/thread_pool.cc diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index 76e977a8716..07c18998253 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -88,9 +88,7 @@ struct ScalarFromArraySlotImpl { } Status Visit(const BinaryViewArray& a) { - StringHeader header = a.Value(index_); - std::string_view view{header}; - return Finish(std::string{view}); + return Finish(std::string{a.GetView(index_)}); } Status Visit(const 
FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); } diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 1c8947dde3a..13277ce1def 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -230,22 +230,6 @@ class ARROW_EXPORT BinaryViewArray : public PrimitiveArray { explicit BinaryViewArray(const std::shared_ptr& data); - /// By default, ValidateFull() will check each view in a BinaryViewArray or - /// StringViewArray to ensure it references a memory range owned by one of the array's - /// buffers. - /// - /// If the last character buffer is null, ValidateFull will skip this step. Use this - /// for arrays which view memory elsewhere. - static BufferVector DoNotValidateViews(BufferVector char_buffers) { - char_buffers.push_back(NULLPTR); - return char_buffers; - } - - static bool OptedOutOfViewValidation(const ArrayData& data) { - return data.buffers.back() == NULLPTR; - } - bool OptedOutOfViewValidation() const { return OptedOutOfViewValidation(*data_); } - BinaryViewArray(int64_t length, std::shared_ptr data, BufferVector char_buffers, std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0) @@ -260,10 +244,19 @@ class ARROW_EXPORT BinaryViewArray : public PrimitiveArray { return reinterpret_cast(raw_values_) + data_->offset; } - const StringHeader& Value(int64_t i) const { return raw_values()[i]; } - // For API compatibility with BinaryArray etc. 
- std::string_view GetView(int64_t i) const { return std::string_view(Value(i)); } + std::string_view GetView(int64_t i) const { + const auto& s = raw_values()[i]; + if (raw_pointers_) { + return std::string_view{s}; + } + if (s.IsInline()) { + return {s.GetInlineData(), s.size()}; + } + auto* char_buffers = data_->buffers.data() + 2; + return {char_buffers[s.GetBufferIndex()]->data_as() + s.GetBufferOffset(), + s.size()}; + } // EXPERIMENTAL std::optional operator[](int64_t i) const { @@ -273,8 +266,18 @@ class ARROW_EXPORT BinaryViewArray : public PrimitiveArray { IteratorType begin() const { return IteratorType(*this); } IteratorType end() const { return IteratorType(*this, length()); } + bool has_raw_pointers() const { return raw_pointers_; } + protected: using PrimitiveArray::PrimitiveArray; + + void SetData(const std::shared_ptr& data) { + PrimitiveArray::SetData(data); + raw_pointers_ = + internal::checked_cast(*type()).has_raw_pointers(); + } + + bool raw_pointers_ = false; }; /// Concrete Array class for variable-size string view (utf-8) data using diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index f21abf681f9..953ff360a93 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -27,6 +27,7 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" +#include "arrow/array/validate.h" #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" @@ -39,6 +40,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/visit_data_inline.h" namespace arrow { @@ -366,50 +368,176 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); } -TEST(StringViewArray, Validate) { - auto MakeArray = [](std::vector headers, 
BufferVector char_buffers) { - auto length = static_cast(headers.size()); - return StringViewArray(length, Buffer::Wrap(std::move(headers)), - std::move(char_buffers)); - }; +namespace string_header_helpers { - // empty array is valid - EXPECT_THAT(MakeArray({}, {}).ValidateFull(), Ok()); +StringHeader Inline(std::string_view chars) { + assert(StringHeader::IsInline(chars.size())); + return StringHeader{chars}; +} - // inline views need not have a corresponding buffer - EXPECT_THAT(MakeArray({"hello", "world", "inline me"}, {}).ValidateFull(), Ok()); +StringHeader NotInline(std::string_view prefix, size_t length, size_t buffer_index, + size_t offset) { + assert(prefix.size() == 4); + assert(!StringHeader::IsInline(length)); + StringHeader s{prefix.data(), length}; + s.SetIndexOffset(buffer_index, offset); + return s; +} + +Result> MakeArray( + BufferVector char_buffers, const std::vector& headers, + bool validate = true) { + auto length = static_cast(headers.size()); + ARROW_ASSIGN_OR_RAISE(auto headers_buf, CopyBufferFromVector(headers)); + auto arr = std::make_shared(length, std::move(headers_buf), + std::move(char_buffers)); + if (validate) { + RETURN_NOT_OK(arr->ValidateFull()); + } + return arr; +} + +Result> MakeArrayFromRawPointerViews( + BufferVector char_buffers, const std::vector& raw) { + ARROW_ASSIGN_OR_RAISE(auto raw_buf, CopyBufferFromVector(raw)); + StringViewArray raw_arr{static_cast(raw.size()), std::move(raw_buf), + char_buffers}; + raw_arr.data()->type = utf8_view(/*has_raw_pointers=*/true); + + ARROW_ASSIGN_OR_RAISE(auto io_buf, AllocateBuffer(raw.size() * sizeof(StringHeader))); + RETURN_NOT_OK(internal::SwapStringHeaderPointers( + *raw_arr.data(), io_buf->mutable_data_as())); + + auto arr = std::make_shared(raw.size(), std::move(io_buf), + std::move(char_buffers)); + RETURN_NOT_OK(arr->ValidateFull()); + return arr; +} + +} // namespace string_header_helpers +TEST(StringViewArray, Validate) { + using string_header_helpers::Inline; + using 
string_header_helpers::MakeArray; + using string_header_helpers::NotInline; + + // Since this is a test of validation, we need to be able to construct invalid arrays. auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)"); auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy"); - // non-inline views are expected to reside in a buffer managed by the array - EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}), - StringHeader(std::string_view{*buffer_y})}, - {buffer_s, buffer_y}) - .ValidateFull(), + // empty array is valid + EXPECT_THAT(MakeArray({}, {}), Ok()); + + // empty array with some character buffers is valid + EXPECT_THAT(MakeArray({buffer_s, buffer_y}, {}), Ok()); + + // inline views need not have a corresponding buffer + EXPECT_THAT(MakeArray({}, {Inline("hello"), Inline("world"), Inline("inline me")}), Ok()); - // overlapping views and buffers are allowed - EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}), - StringHeader(std::string_view{*buffer_s}.substr(5, 5)), - StringHeader(std::string_view{*buffer_s}.substr(9, 4))}, - {buffer_s, SliceBuffer(buffer_s, 1, 1), SliceBuffer(buffer_s, 3, 6)}) - .ValidateFull(), + // non-inline views are expected to reference only buffers managed by the array + EXPECT_THAT( + MakeArray({buffer_s, buffer_y}, {NotInline("supe", buffer_s->size(), 0, 0), + NotInline("yyyy", buffer_y->size(), 1, 0)}), + Ok()); + + // views may not reference char buffers not present in the array + EXPECT_THAT(MakeArray({}, {NotInline("supe", buffer_s->size(), 0, 0)}), + Raises(StatusCode::IndexError)); + // ... 
or ranges which overflow the referenced char buffer + EXPECT_THAT(MakeArray({buffer_s}, {NotInline("supe", buffer_s->size() + 50, 0, 0)}), + Raises(StatusCode::IndexError)); + + // Additionally, the prefixes of non-inline views must match the character buffer + EXPECT_THAT( + MakeArray({buffer_s, buffer_y}, {NotInline("SUPE", buffer_s->size(), 0, 0), + NotInline("yyyy", buffer_y->size(), 1, 0)}), + Raises(StatusCode::Invalid)); + + // Invalid string views which are masked by a null bit do not cause validation to fail + auto invalid_but_masked = MakeArray({buffer_s}, + {NotInline("SUPE", buffer_s->size(), 0, 0), + NotInline("yyyy", 50, 40, 30)}, + /*validate=*/false) + .ValueOrDie() + ->data(); + invalid_but_masked->null_count = 2; + invalid_but_masked->buffers[0] = *AllocateEmptyBitmap(2); + EXPECT_THAT(internal::ValidateArrayFull(*invalid_but_masked), Ok()); + + // overlapping views are allowed + EXPECT_THAT(MakeArray({buffer_s}, + { + NotInline("supe", buffer_s->size(), 0, 0), + NotInline("uper", buffer_s->size() - 1, 0, 1), + NotInline("perc", buffer_s->size() - 2, 0, 2), + NotInline("erca", buffer_s->size() - 3, 0, 3), + }), Ok()); +} + +TEST(StringViewArray, BinaryViewArrayFromRawPointerViews) { + using string_header_helpers::Inline; + using string_header_helpers::MakeArray; + using string_header_helpers::MakeArrayFromRawPointerViews; + using string_header_helpers::NotInline; + + auto Roundtrip = [&](Result> maybe_arr) { + ARROW_ASSIGN_OR_RAISE(auto arr, maybe_arr); + + std::vector raw(arr->length()); + RETURN_NOT_OK( + internal::SwapStringHeaderPointers(*arr->data(), raw.data())); + for (size_t i = 0; i < raw.size(); ++i) { + if (std::string_view{raw[i]} != arr->GetView(i)) { + return Status::Invalid("Produced incorrect raw pointer headers"); + } + } + + BufferVector char_buffers{arr->data()->buffers.begin() + 2, + arr->data()->buffers.end()}; + ARROW_ASSIGN_OR_RAISE(auto round_tripped, + MakeArrayFromRawPointerViews(std::move(char_buffers), raw)); - 
EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}), - // if a view points outside the buffers, that is invalid - StringHeader("from a galaxy far, far away"), - StringHeader(std::string_view{*buffer_y})}, - {buffer_s, buffer_y}) - .ValidateFull(), - Raises(StatusCode::Invalid)); + if (round_tripped->Equals(arr)) { + return Status::OK(); + } + return Status::Invalid("not equal"); + }; - // ... unless specifically overridden EXPECT_THAT( - MakeArray({"from a galaxy far, far away"}, StringViewArray::DoNotValidateViews({})) - .ValidateFull(), + Roundtrip(MakeArray({}, {Inline("hello"), Inline("world"), Inline("inline me")})), Ok()); + + auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)"); + auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy"); + + EXPECT_THAT(Roundtrip(MakeArray({buffer_s, buffer_y}, + { + NotInline("supe", buffer_s->size(), 0, 0), + Inline("hello"), + NotInline("yyyy", buffer_y->size(), 1, 0), + Inline("world"), + NotInline("uper", buffer_s->size() - 1, 0, 1), + })), + Ok()); + + // use a larger number of buffers to test the binary search case + BufferVector buffers; + std::vector headers; + for (size_t i = 0; i < 40; ++i) { + buffers.push_back(Buffer::FromString(std::string(13, 'c'))); + headers.push_back(NotInline("cccc", 13, i, 0)); + } + EXPECT_THAT(Roundtrip(MakeArray(buffers, headers)), Ok()); + + EXPECT_THAT( + MakeArrayFromRawPointerViews({buffer_s, buffer_y}, + { + "not inlined, outside buffers", + }), + Raises(StatusCode::IndexError, + testing::HasSubstr("pointed outside the provided character buffers"))); } template diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc index 07dc3014e40..97110ea97f3 100644 --- a/cpp/src/arrow/array/array_view_test.cc +++ b/cpp/src/arrow/array/array_view_test.cc @@ -126,6 +126,38 @@ TEST(TestArrayView, StringAsBinary) { CheckView(expected, arr); } +TEST(TestArrayView, StringViewAsBinaryView) { + for (auto json : { + R"(["foox", 
"barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = ArrayFromJSON(utf8_view(), json); + auto expected = ArrayFromJSON(binary_view(), json); + CheckView(arr, expected); + CheckView(expected, arr); + } +} + +TEST(TestArrayView, StringViewAsBinaryViewInStruct) { + auto padl = ArrayFromJSON(list(int16()), "[[0, -1], [], [42]]"); + auto padr = ArrayFromJSON(utf8(), R"(["foox", "barz", null])"); + + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = + StructArray::Make({padl, ArrayFromJSON(utf8_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + auto expected = + StructArray::Make({padl, ArrayFromJSON(binary_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + + CheckView(arr, expected); + CheckView(expected, arr); + } +} + TEST(TestArrayView, PrimitiveWrongSize) { auto arr = ArrayFromJSON(int16(), "[0, -1, 42]"); CheckViewFails(arr, int8()); diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index e0a7bc1193a..b5ed7187edb 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -42,6 +42,11 @@ using internal::checked_cast; // ---------------------------------------------------------------------- // Binary/StringView +BinaryViewBuilder::BinaryViewBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : BinaryViewBuilder(pool) { + ARROW_CHECK(!checked_cast(*type).has_raw_pointers()); +} Status BinaryViewBuilder::AppendValues(const std::vector& values, const uint8_t* valid_bytes) { @@ -63,8 +68,8 @@ Status BinaryViewBuilder::AppendValues(const std::vector& values, } } } else { - for (std::size_t i = 0; i < values.size(); ++i) { - UnsafeAppend(values[i]); + for (const auto& value : values) { + UnsafeAppend(value); } } UnsafeAppendToBitmap(valid_bytes, values.size()); @@ -211,8 +216,8 @@ const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { std::string_view 
FixedSizeBinaryBuilder::GetView(int64_t i) const { const uint8_t* data_ptr = byte_builder_.data(); - return std::string_view(reinterpret_cast(data_ptr + i * byte_width_), - byte_width_); + return {reinterpret_cast(data_ptr + i * byte_width_), + static_cast(byte_width_)}; } // ---------------------------------------------------------------------- @@ -259,10 +264,10 @@ Status ChunkedStringBuilder::Finish(ArrayVector* out) { RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); // Change data type to string/utf8 - for (size_t i = 0; i < out->size(); ++i) { - std::shared_ptr data = (*out)[i]->data(); + for (auto& chunk : *out) { + std::shared_ptr data = chunk->data()->Copy(); data->type = ::arrow::utf8(); - (*out)[i] = std::make_shared(data); + chunk = std::make_shared(std::move(data)); } return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index d4f835ad1e6..e3053cae1de 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -204,10 +204,10 @@ class BaseBinaryBuilder } } } else { - for (std::size_t i = 0; i < values.size(); ++i) { + for (const auto& value : values) { UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend( - reinterpret_cast(values[i].data()), values[i].size()); + value_data_builder_.UnsafeAppend(reinterpret_cast(value.data()), + value.size()); } } @@ -466,10 +466,7 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { // ---------------------------------------------------------------------- // BinaryViewBuilder, StringViewBuilder // -// The builders permit two styles of use: one where appended data is -// accumulated in a third buffer that is appended to the resulting ArrayData, -// and one where only the StringHeaders are appended. If you only want to -// append StringHeaders, then use the Append(const StringHeader&) methods +// These builders do not support building raw pointer string view arrays. 
namespace internal { @@ -486,31 +483,34 @@ class ARROW_EXPORT StringHeapBuilder { public: static constexpr int64_t kDefaultBlocksize = 1 << 20; // 1MB - StringHeapBuilder(MemoryPool* pool, int64_t blocksize = kDefaultBlocksize) - : pool_(pool), blocksize_(blocksize) {} - - const uint8_t* UnsafeAppend(const uint8_t* data, int64_t num_bytes) { - memcpy(current_out_buffer_, data, static_cast(num_bytes)); - const uint8_t* result = current_out_buffer_; - current_out_buffer_ += num_bytes; - current_remaining_bytes_ -= num_bytes; - return result; - } + StringHeapBuilder(MemoryPool* pool, int64_t alignment, + int64_t blocksize = kDefaultBlocksize) + : pool_(pool), blocksize_(blocksize), alignment_(alignment) {} - Result Append(const uint8_t* data, int64_t num_bytes) { - if (num_bytes > current_remaining_bytes_) { - ARROW_RETURN_NOT_OK(Reserve(num_bytes)); - } - return UnsafeAppend(data, num_bytes); + void UnsafeAppend(StringHeader* raw_not_inlined) { + memcpy(current_out_buffer_, raw_not_inlined->GetRawPointer(), + raw_not_inlined->size()); + raw_not_inlined->SetIndexOffset(blocks_.size() - 1, current_offset_); + current_out_buffer_ += raw_not_inlined->size(); + current_remaining_bytes_ -= raw_not_inlined->size(); + current_offset_ += raw_not_inlined->size(); } /// \brief Ensure that the indicated number of bytes can be appended via /// UnsafeAppend operations without the need to allocate more memory Status Reserve(int64_t num_bytes) { if (num_bytes > current_remaining_bytes_) { + // Ensure the buffer is fully overwritten to avoid leaking uninitialized + // bytes from the allocator + if (current_remaining_bytes_ > 0) { + std::memset(current_out_buffer_, 0, current_remaining_bytes_); + blocks_.back() = SliceBuffer(blocks_.back(), 0, + blocks_.back()->size() - current_remaining_bytes_); + } current_remaining_bytes_ = num_bytes > blocksize_ ? 
num_bytes : blocksize_; ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_block, - AllocateBuffer(current_remaining_bytes_, pool_)); + AllocateBuffer(current_remaining_bytes_, alignment_, pool_)); + current_offset_ = 0; current_out_buffer_ = new_block->mutable_data(); blocks_.emplace_back(std::move(new_block)); } @@ -518,6 +518,7 @@ class ARROW_EXPORT StringHeapBuilder { } void Reset() { + current_offset_ = 0; current_out_buffer_ = nullptr; current_remaining_bytes_ = 0; blocks_.clear(); @@ -526,6 +527,7 @@ class ARROW_EXPORT StringHeapBuilder { int64_t current_remaining_bytes() const { return current_remaining_bytes_; } std::vector> Finish() { + current_offset_ = 0; current_out_buffer_ = nullptr; current_remaining_bytes_ = 0; return std::move(blocks_); @@ -534,8 +536,10 @@ class ARROW_EXPORT StringHeapBuilder { private: MemoryPool* pool_; const int64_t blocksize_; + int64_t alignment_; std::vector> blocks_; + size_t current_offset_ = 0; uint8_t* current_out_buffer_ = nullptr; int64_t current_remaining_bytes_ = 0; }; @@ -547,31 +551,28 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { using TypeClass = BinaryViewType; // this constructor provided for MakeBuilder compatibility - BinaryViewBuilder(const std::shared_ptr&, MemoryPool* pool) - : BinaryViewBuilder(pool) {} + BinaryViewBuilder(const std::shared_ptr&, MemoryPool* pool); explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(), int64_t alignment = kDefaultBufferAlignment) : ArrayBuilder(pool, alignment), data_builder_(pool, alignment), - data_heap_builder_(pool) {} + data_heap_builder_(pool, alignment) {} int64_t current_block_bytes_remaining() const { return data_heap_builder_.current_remaining_bytes(); } Status Append(const uint8_t* value, int64_t length) { - ARROW_RETURN_NOT_OK(Reserve(1)); - if (length > static_cast(StringHeader::kInlineSize)) { - // String is stored out-of-line - if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) { - return Status::CapacityError( - "BinaryView or 
StringView elements cannot reference " - "strings larger than 4GB"); - } - // Overwrite 'value' since we will use that for the StringHeader value below - ARROW_ASSIGN_OR_RAISE(value, data_heap_builder_.Append(value, length)); + if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 4GB"); } + if (!StringHeader::IsInline(length)) { + ARROW_RETURN_NOT_OK(ReserveData(length)); + } + ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppend(StringHeader(value, length)); return Status::OK(); } @@ -584,22 +585,11 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { return Append(value.data(), static_cast(value.size())); } - Status Append(StringHeader value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(value); - return Status::OK(); - } - /// \brief Append without checking capacity /// /// Builder should have been presized using Reserve() and ReserveData(), /// respectively, and the value must not be larger than 4GB void UnsafeAppend(const uint8_t* value, int64_t length) { - if (length > static_cast(StringHeader::kInlineSize)) { - // String is stored out-of-line - // Overwrite 'value' since we will use that for the StringHeader value below - value = data_heap_builder_.UnsafeAppend(value, length); - } UnsafeAppend(StringHeader(value, length)); } @@ -616,8 +606,12 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { } void UnsafeAppend(StringHeader value) { - data_builder_.UnsafeAppend(value); UnsafeAppendToBitmap(true); + if (!value.IsInline()) { + // String is stored out-of-line + data_heap_builder_.UnsafeAppend(&value); + } + data_builder_.UnsafeAppend(value); } /// \brief Ensures there is enough allocated available capacity in the @@ -627,7 +621,7 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(length, StringHeader()); // 
zero + data_builder_.UnsafeAppend(length, StringHeader{}); // zero UnsafeSetNull(length); return Status::OK(); } @@ -635,7 +629,7 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { /// \brief Append a single null element Status AppendNull() final { ARROW_RETURN_NOT_OK(Reserve(1)); - data_builder_.UnsafeAppend(StringHeader()); // zero + data_builder_.UnsafeAppend(StringHeader{}); // zero UnsafeAppendToBitmap(false); return Status::OK(); } @@ -643,7 +637,7 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { /// \brief Append a empty element (length-0 inline string) Status AppendEmptyValue() final { ARROW_RETURN_NOT_OK(Reserve(1)); - data_builder_.UnsafeAppend(StringHeader("")); // zero + data_builder_.UnsafeAppend(StringHeader{}); // zero UnsafeAppendToBitmap(true); return Status::OK(); } @@ -651,18 +645,18 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { /// \brief Append several empty elements Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(length, StringHeader("")); + data_builder_.UnsafeAppend(length, StringHeader{}); // zero UnsafeSetNotNull(length); return Status::OK(); } void UnsafeAppendNull() { - data_builder_.UnsafeAppend(StringHeader()); + data_builder_.UnsafeAppend(StringHeader{}); // zero UnsafeAppendToBitmap(false); } void UnsafeAppendEmptyValue() { - data_builder_.UnsafeAppend(StringHeader()); + data_builder_.UnsafeAppend(StringHeader{}); // zero UnsafeAppendToBitmap(true); } @@ -746,7 +740,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status Append(const Buffer& s) { ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(std::string_view(s)); + UnsafeAppend(s); return Status::OK(); } @@ -797,7 +791,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppend(reinterpret_cast(value.data())); } - void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view(s)); } + void UnsafeAppend(const Buffer& s) { 
UnsafeAppend(std::string_view{s}); } void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index a5175eb31cc..6e863490480 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -229,30 +229,41 @@ class ConcatenateImpl { return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } - Status Visit(const BinaryViewType&) { - bool any_opted_out_of_view_validation = false; + Status Visit(const BinaryViewType& type) { out_->buffers.resize(2); for (const auto& in_data : in_) { auto begin = in_data->buffers.begin() + 2; auto end = in_data->buffers.end(); - if (BinaryViewArray::OptedOutOfViewValidation(*in_data)) { - any_opted_out_of_view_validation = true; - --end; - } - for (auto it = begin; it != end; ++it) { out_->buffers.push_back(*it); } } - if (any_opted_out_of_view_validation) { - out_->buffers = BinaryViewArray::DoNotValidateViews(std::move(out_->buffers)); + ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, sizeof(StringHeader))); + ARROW_ASSIGN_OR_RAISE(auto header_buffer, ConcatenateBuffers(header_buffers, pool_)); + + if (!type.has_raw_pointers()) { + auto* s = header_buffer->mutable_data_as(); + + size_t preceding_buffer_count = 0; + + int64_t i = in_[0]->length; + for (size_t in_index = 1; in_index < in_.size(); ++in_index) { + preceding_buffer_count += in_[in_index - 1]->buffers.size() - 2; + + for (int64_t end_i = i + in_[in_index]->length; i < end_i; ++i) { + if (s[i].IsInline()) continue; + auto buffer_index = + static_cast(s[i].GetBufferIndex() + preceding_buffer_count); + s[i].SetIndexOffset(buffer_index, s[i].GetBufferOffset()); + } + } } - ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, sizeof(StringHeader))); - return ConcatenateBuffers(header_buffers, pool_).Value(&out_->buffers[1]); + out_->buffers[1] = std::move(header_buffer); + return Status::OK(); } Status Visit(const ListType&) { @@ 
-631,6 +642,10 @@ Result> Concatenate(const ArrayVector& arrays, MemoryPool return Status::Invalid("Must pass at least one array"); } + if (arrays.size() == 1) { + return arrays[0]; + } + // gather ArrayData of input arrays ArrayDataVector data(arrays.size()); for (size_t i = 0; i < arrays.size(); ++i) { diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index c074db8a886..d9b0b7d235b 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -94,7 +94,12 @@ class ConcatenateTest : public ::testing::Test { factory(size, null_probability, &array); ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); + ASSERT_OK(expected->ValidateFull()); auto slices = this->Slices(array, offsets); + for (auto slice : slices) { + ASSERT_OK(slice->ValidateFull()); + } + ASSERT_OK(expected->ValidateFull()); ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices)); AssertArraysEqual(*expected, *actual); if (actual->data()->buffers[0]) { @@ -161,6 +166,16 @@ TEST_F(ConcatenateTest, StringViewType) { *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability); ASSERT_OK((**out).ValidateFull()); }); + + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability); + const ArrayData& io = *(*out)->data(); + auto raw_buf = AllocateBuffer(io.buffers[1]->size()).ValueOrDie(); + ABORT_NOT_OK( + internal::SwapStringHeaderPointers(io, raw_buf->mutable_data_as())); + (*out)->data()->buffers[1] = std::move(raw_buf); + (*out)->data()->type = utf8_view(/*has_raw_pointers=*/true); + }); } TEST_F(ConcatenateTest, LargeStringType) { diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index ec42e90b9a3..30b558b8d27 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -199,7 +199,7 @@ void 
ArraySpan::SetMembers(const ArrayData& data) { Type::type type_id = this->type->id(); if (type_id == Type::EXTENSION) { - const ExtensionType* ext_type = checked_cast(this->type); + auto* ext_type = checked_cast(this->type); type_id = ext_type->storage_type()->id(); } @@ -214,6 +214,14 @@ void ArraySpan::SetMembers(const ArrayData& data) { this->buffers[i] = {}; } + if (type_id == Type::STRING_VIEW || type_id == Type::BINARY_VIEW) { + // store the span of character buffers in the third buffer + this->buffers[2].data = + const_cast(reinterpret_cast(data.buffers.data() + 2)); + this->buffers[2].size = + static_cast(data.buffers.size() - 2) * sizeof(std::shared_ptr); + } + if (type_id == Type::DICTIONARY) { this->child_data.resize(1); this->child_data[0].SetMembers(*data.dictionary); @@ -248,6 +256,8 @@ int GetNumBuffers(const DataType& type) { case Type::LARGE_BINARY: case Type::STRING: case Type::LARGE_STRING: + case Type::STRING_VIEW: + case Type::BINARY_VIEW: case Type::DENSE_UNION: return 3; case Type::EXTENSION: @@ -330,12 +340,12 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } else if (is_base_binary_like(type_id)) { const auto& scalar = checked_cast(value); this->buffers[1].data = reinterpret_cast(this->scratch_space); - const uint8_t* data_buffer = nullptr; - int64_t data_size = 0; - if (scalar.is_valid) { - data_buffer = scalar.value->data(); - data_size = scalar.value->size(); - } + static auto kEmptyBuffer = Buffer::FromString(""); + const auto& value = scalar.is_valid ? 
scalar.value : kEmptyBuffer; + this->buffers[2].data = const_cast(value->data()); + this->buffers[2].size = value->size(); + this->buffers[2].owner = &value; + int64_t data_size = this->buffers[2].size; if (is_binary_like(type_id)) { SetOffsetsForScalar(this, reinterpret_cast(this->scratch_space), data_size); @@ -344,18 +354,24 @@ void ArraySpan::FillFromScalar(const Scalar& value) { SetOffsetsForScalar(this, reinterpret_cast(this->scratch_space), data_size); } - this->buffers[2].data = const_cast(data_buffer); - this->buffers[2].size = data_size; } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) { const auto& scalar = checked_cast(value); + this->buffers[1].data = reinterpret_cast(this->scratch_space); if (scalar.is_valid) { - *reinterpret_cast(this->buffers[1].data) = {scalar.value->data(), - scalar.value->size()}; - this->buffers[2].data = const_cast(scalar.value->data()); - this->buffers[2].size = scalar.value->size(); + if (checked_cast(type)->has_raw_pointers()) { + new (this->scratch_space) StringHeader{scalar.value->data_as(), + static_cast(scalar.value->size())}; + } else { + new (this->scratch_space) StringHeader{scalar.value->data_as(), + static_cast(scalar.value->size()), + 0, scalar.value->data_as()}; + } + this->buffers[2].data = + const_cast(reinterpret_cast(&scalar.value)); + this->buffers[2].size = sizeof(std::shared_ptr); } else { - *reinterpret_cast(this->buffers[1].data) = {}; + new (this->scratch_space) StringHeader{}; } } else if (type_id == Type::FIXED_SIZE_BINARY) { const auto& scalar = checked_cast(value); @@ -698,7 +714,8 @@ struct ViewDataImpl { } RETURN_NOT_OK(CheckInputAvailable()); - const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx]; + const auto& in_layout = in_layouts[in_layout_idx]; + const auto& in_spec = in_layout.buffers[in_buffer_idx]; if (out_spec != in_spec) { return InvalidView("incompatible layouts"); } @@ -709,6 +726,18 @@ struct ViewDataImpl { 
DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx); out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); ++in_buffer_idx; + + if (in_buffer_idx == in_layout.buffers.size()) { + if (out_layout.variadic_spec != in_layout.variadic_spec) { + return InvalidView("incompatible layouts"); + } + + if (in_layout.variadic_spec) { + for (; in_buffer_idx < in_data_item->buffers.size(); ++in_buffer_idx) { + out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); + } + } + } AdjustInputPointer(); } diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 6294f65a858..5909f6dd387 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -384,6 +384,7 @@ struct ARROW_EXPORT ArraySpan { ArraySpan(const ArrayData& data) { // NOLINT implicit conversion SetMembers(data); } + /// Warning: this produces an ArraySpan which cannot be safely moved/copied! explicit ArraySpan(const Scalar& data) { FillFromScalar(data); } /// If dictionary-encoded, put dictionary in the first entry @@ -391,6 +392,7 @@ struct ARROW_EXPORT ArraySpan { /// \brief Populate ArraySpan to look like an array of length 1 pointing at /// the data members of a Scalar value + /// Warning: this produces an ArraySpan which cannot be safely moved/copied! 
void FillFromScalar(const Scalar& value); void SetMembers(const ArrayData& data); @@ -472,6 +474,8 @@ struct ARROW_EXPORT ArraySpan { this->null_count = this->length; } else if (this->MayHaveNulls()) { this->null_count = kUnknownNullCount; + } else { + this->null_count = 0; } } diff --git a/cpp/src/arrow/array/dict_internal.h b/cpp/src/arrow/array/dict_internal.h index 5245c8d0ff3..1b9e8162e4a 100644 --- a/cpp/src/arrow/array/dict_internal.h +++ b/cpp/src/arrow/array/dict_internal.h @@ -156,6 +156,60 @@ struct DictionaryTraits> { } }; +template +struct DictionaryTraits> { + using MemoTableType = typename HashTraits::MemoTableType; + + static_assert(std::is_same_v>); + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + DCHECK(type->id() == Type::STRING_VIEW || type->id() == Type::BINARY_VIEW); + + auto dict_length = static_cast(memo_table.size() - start_offset); + + // Create an offsets buffer + // TODO(bkietz) this could be skipped with a custom memo table for StringView + ARROW_ASSIGN_OR_RAISE(auto dict_offsets, + AllocateBuffer(sizeof(int32_t) * (dict_length + 1), pool)); + auto* offsets = reinterpret_cast(dict_offsets->mutable_data()); + memo_table.CopyOffsets(static_cast(start_offset), offsets); + + // Create the data buffer + auto values_size = memo_table.values_size(); + ARROW_ASSIGN_OR_RAISE(auto dict_data, AllocateBuffer(values_size, pool)); + if (values_size > 0) { + memo_table.CopyValues(static_cast(start_offset), dict_data->size(), + dict_data->mutable_data()); + } + auto* data = dict_data->template data_as(); + + ARROW_ASSIGN_OR_RAISE(auto dict_headers, + AllocateBuffer(sizeof(StringHeader) * dict_length, pool)); + auto* headers = dict_headers->mutable_data_as(); + for (int64_t i = 0; i < dict_length; ++i) { + auto size = static_cast(offsets[i + 1] - offsets[i]); + auto offset = static_cast(offsets[i]); + new (headers++) 
StringHeader{data + offset, size, 0, data}; + } + + int64_t null_count = 0; + std::shared_ptr null_bitmap = nullptr; + RETURN_NOT_OK( + ComputeNullBitmap(pool, memo_table, start_offset, &null_count, &null_bitmap)); + + *out = ArrayData::Make( + type, dict_length, + {std::move(null_bitmap), std::move(dict_headers), std::move(dict_data)}, + null_count); + + return Status::OK(); + } +}; + template struct DictionaryTraits> { using MemoTableType = typename HashTraits::MemoTableType; diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 191c039d021..ad8155a4a90 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -43,6 +43,9 @@ #include "arrow/util/decimal.h" #include "arrow/util/endian.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" +#include "arrow/util/span.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -267,12 +270,45 @@ class ArrayDataEndianSwapper { return Status::OK(); } - template - enable_if_t::value || - std::is_same::value, - Status> - Visit(const T& type) { - return Status::NotImplemented("Binary / string view"); + Status Visit(const BinaryViewType& type) { + if (type.has_raw_pointers()) { + return Status::NotImplemented( + "Swapping endianness of binary / string view with raw pointers"); + } + + auto* s = data_->buffers[1]->data_as(); + ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); + auto* new_s = new_buffer->mutable_data_as(); + + // NOTE: data_->length not trusted (see warning above) + const int64_t length = data_->buffers[1]->size() / sizeof(StringHeader); + + for (int64_t i = 0; i < length; i++) { + uint32_t size = s[i].size(); +#if ARROW_LITTLE_ENDIAN + size = bit_util::FromBigEndian(size); +#else + size = bit_util::FromLittleEndian(size); +#endif + if (StringHeader::IsInline(size)) { + new_s[i] = s[i]; + std::memcpy(static_cast(&new_s[i]), &size, sizeof(uint32_t)); + continue; + } + + uint32_t 
buffer_index = s[i].GetBufferIndex(); + uint32_t offset = s[i].GetBufferOffset(); +#if ARROW_LITTLE_ENDIAN + buffer_index = bit_util::FromBigEndian(buffer_index); + offset = bit_util::FromBigEndian(offset); +#else + buffer_index = bit_util::FromLittleEndian(buffer_index); + offset = bit_util::FromLittleEndian(offset); +#endif + new_s[i] = StringHeader{size, s[i].GetPrefix(), buffer_index, offset}; + } + out_->buffers[1] = std::move(new_buffer); + return Status::OK(); } Status Visit(const ListType& type) { @@ -594,7 +630,7 @@ class NullArrayFactory { } MemoryPool* pool_; - std::shared_ptr type_; + const std::shared_ptr& type_; int64_t length_; std::shared_ptr out_; std::shared_ptr buffer_; @@ -969,5 +1005,221 @@ std::vector RechunkArraysConsistently( return rechunked_groups; } +namespace { +Status FromRawPointerStringHeaders(const ArraySpan& raw, + util::span> char_buffers, + StringHeader* io) { + DCHECK_NE(char_buffers.size(), 0); + + auto IsInBuffer = [](const Buffer& buffer, StringHeader s) { + return buffer.data_as() <= s.data() && + buffer.data_as() + buffer.size() >= s.data() + s.size(); + }; + + auto Write = [&](auto find_containing_buffer) { + // Given `find_containing_buffer` which looks up the index of a buffer containing + // a StringHeader, write an equivalent buffer of index/offset string views. + static const Buffer kEmptyBuffer{""}; + const Buffer* buffer_containing_previous_view = &kEmptyBuffer; + uint32_t buffer_index; + + auto* raw_ptr = raw.GetValues(1); + + bool all_valid = true; + VisitNullBitmapInline( + raw.buffers[0].data, raw.offset, raw.length, raw.null_count, + [&] { + // Copied to a local variable, so even if io == raw_ptr + // we can modify safely. + auto s = *raw_ptr++; + + if (!s.IsInline()) { + // Fast path: for most string view arrays, we'll have runs + // of views into the same buffer. 
+ if (ARROW_PREDICT_FALSE(!IsInBuffer(*buffer_containing_previous_view, s))) { + auto found = find_containing_buffer(s); + if (ARROW_PREDICT_FALSE(!found)) { + all_valid = false; + return; + } + // Assume that we're at the start of a run of views into + // char_buffers[buffer_index]; adjust the fast path's pointer accordingly + buffer_index = *found; + buffer_containing_previous_view = char_buffers[buffer_index].get(); + } + + s.SetIndexOffset( + buffer_index, + s.data() - char_buffers[buffer_index]->template data_as()); + } + *io++ = s; + }, + [&] { + ++raw_ptr; + *io++ = {}; + }); + + if (!all_valid) { + return Status::IndexError( + "A header pointed outside the provided character buffers"); + } + return Status::OK(); + }; + + auto LinearSearch = [&](StringHeader s) -> std::optional { + uint32_t buffer_index = 0; + for (const auto& char_buffer : char_buffers) { + if (IsInBuffer(*char_buffer, s)) return buffer_index; + ++buffer_index; + } + return {}; + }; + + if (char_buffers.size() <= 32) { + // If there are few buffers to search through, sorting/binary search is not + // worthwhile. TODO(bkietz) benchmark this and get a less magic number here. + return Write(LinearSearch); + } + + auto sort_indices = ArgSort( + char_buffers, [](const auto& l, const auto& r) { return l->data() < r->data(); }); + + auto first_overlapping = std::adjacent_find( + sort_indices.begin(), sort_indices.end(), [&](int64_t before, int64_t after) { + return char_buffers[before]->data() + char_buffers[before]->size() <= + char_buffers[after]->data(); + }); + if (ARROW_PREDICT_FALSE(first_overlapping != sort_indices.end())) { + // Using a binary search with overlapping buffers would not *uniquely* identify + // a potentially-containing buffer. Moreover this should be a fairly rare case + // so optimizing for it seems premature. 
+ return Write(LinearSearch); + } + + auto BinarySearch = [&](StringHeader s) -> std::optional { + // Find the first buffer whose data starts after the data in view- + // only buffers *before* this could contain view. Since we've additionally + // checked that the buffers do not overlap, only the buffer *immediately before* + // this could contain view. + auto one_past_potential_super = + std::upper_bound(sort_indices.begin(), sort_indices.end(), s, + [&](const StringHeader& s, int64_t i) { + return IsInBuffer(*char_buffers[i], s); + }); + + if (ARROW_PREDICT_FALSE(one_past_potential_super == sort_indices.begin())) { + return {}; + } + + auto buffer_index = *(one_past_potential_super - 1); + const auto& char_buffer = *char_buffers[buffer_index]; + if (ARROW_PREDICT_TRUE(IsInBuffer(char_buffer, s))) return buffer_index; + + return {}; + }; + + return Write(BinarySearch); +} + +Status ToRawPointerStringHeaders(const ArraySpan& io, + util::span> char_buffers, + StringHeader* raw) { + DCHECK_NE(char_buffers.size(), 0); + + uint32_t buffer_index = 0; + const char* buffer_data = char_buffers[0]->data_as(); + auto* io_ptr = io.GetValues(1); + + bool all_valid = true; + VisitNullBitmapInline( + io.buffers[0].data, io.offset, io.length, io.null_count, + [&] { + // Copied to a local variable, so even if raw == io_ptr + // we can modify safely. + auto s = *io_ptr++; + + if (!s.IsInline()) { + // Fast path: for most string view arrays, we'll have runs + // of views into the same buffer. 
+ if (ARROW_PREDICT_FALSE(s.GetBufferIndex() != buffer_index)) { + if (ARROW_PREDICT_FALSE(s.GetBufferIndex() >= char_buffers.size())) { + all_valid = false; + return; + } + // Assume that we're at the start of a run of views into + // char_buffers[buffer_index]; adjust the fast path's pointer accordingly + buffer_index = s.GetBufferIndex(); + buffer_data = char_buffers[buffer_index]->data_as(); + } + s.SetRawPointer(buffer_data + s.GetBufferOffset()); + } + *raw++ = s; + }, + [&] { + ++io_ptr; + *raw++ = {}; + }); + + if (!all_valid) { + return Status::IndexError("A header pointed outside the provided character buffers"); + } + return Status::OK(); +} +} // namespace + +Status SwapStringHeaderPointers(const ArraySpan& in, StringHeader* out) { + util::span char_buffers{ + reinterpret_cast*>(in.buffers[2].data), + static_cast(in.buffers[2].size / sizeof(std::shared_ptr))}; + + if (char_buffers.size() == 0) { + // If there are no character buffers, then all string views must be inline. + // In this case the buffer does not require swizzling between pointers and + // index/offsets. + auto* in_ptr = in.GetValues(1); + + bool all_inline = true; + VisitNullBitmapInline( + in.buffers[0].data, in.offset, in.length, in.null_count, + [&] { + all_inline = all_inline && in_ptr->IsInline(); + auto s = *in_ptr++; + *out++ = s; + }, + [&] { + ++in_ptr; + *out++ = {}; + }); + if (ARROW_PREDICT_FALSE(!all_inline)) { + return Status::IndexError( + "A header was not inline when no character buffers were provided"); + } + return Status::OK(); + } + + return checked_cast(in.type)->has_raw_pointers() + ? 
FromRawPointerStringHeaders(in, char_buffers, out) + : ToRawPointerStringHeaders(in, char_buffers, out); +} + +void StringHeadersFromStrings(const ArraySpan& strings, StringHeader* s) { + auto* buffer_data = reinterpret_cast(strings.buffers[2].data); + VisitArraySpanInline( + strings, + [&](std::string_view v) { + *s++ = StringHeader{v.data(), static_cast(v.size()), 0, buffer_data}; + }, + [&] { *s++ = StringHeader{}; }); +} + +void RawPointerStringHeadersFromStrings(const ArraySpan& strings, StringHeader* s) { + VisitArraySpanInline( + strings, + [&](std::string_view v) { + *s++ = StringHeader{v.data(), static_cast(v.size())}; + }, + [&] { *s++ = StringHeader{}; }); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index 6e6c61bd03d..adae067db0c 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -85,5 +85,26 @@ Result> SwapEndianArrayData( ARROW_EXPORT std::vector RechunkArraysConsistently(const std::vector&); +/// Convert between index/offset and raw pointer StringHeaders. +/// +/// This function can be used to overwrite a buffer of StringHeader if desired, +/// IE it is supported for `in.buffers[1].data == out`. +/// +/// Note that calling this function is not necessary if all StringHeaders happen to be +/// Inline; this is usually efficiently detectable by checking for an absence of any +/// character buffers. +/// +/// Will raise IndexError if a header views memory outside the provided character buffers. 
+ARROW_EXPORT +Status SwapStringHeaderPointers(const ArraySpan& in, StringHeader* out); + +/// Fill a buffer of index/offset StringHeader from a dense string array +ARROW_EXPORT +void StringHeadersFromStrings(const ArraySpan& strings, StringHeader* io); + +/// Fill a buffer of raw pointer StringHeader from a dense string array +ARROW_EXPORT +void RawPointerStringHeadersFromStrings(const ArraySpan& strings, StringHeader* raw); + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 4d80c4d92e1..0e85add30b4 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -33,6 +33,7 @@ #include "arrow/util/ree_util.h" #include "arrow/util/sort.h" #include "arrow/util/string.h" +#include "arrow/util/string_header.h" #include "arrow/util/unreachable.h" #include "arrow/util/utf8.h" #include "arrow/visit_data_inline.h" @@ -40,133 +41,6 @@ namespace arrow::internal { -/// visitor will be called once for each non-inlined StringHeader. -/// It will be passed the index of each non-inlined StringHeader, -/// as well as a `const shared_ptr&` of the buffer -/// wherein the viewed memory resides, or nullptr if the viewed memory -/// is not in a buffer managed by this array. 
-template -Status VisitNonInlinedViewsAndOwningBuffers(const ArrayData& data, - const Visitor& visitor) { - auto* headers = data.buffers[1]->data_as(); - - static const std::shared_ptr kNullBuffer = nullptr; - - if (data.buffers.size() == 2 || - (data.buffers.size() == 3 && data.buffers.back() == nullptr)) { - // there are no character buffers, just visit a null buffer - for (int64_t i = 0; i < data.length; ++i) { - if (headers[i].IsInline()) continue; - RETURN_NOT_OK(visitor(i, kNullBuffer)); - } - return Status::OK(); - } - - auto IsSubrangeOf = [](std::string_view super, StringHeader sub) { - return super.data() <= sub.data() && - super.data() + super.size() >= sub.data() + sub.size(); - }; - - std::vector buffers; - std::vector*> owning_buffers; - for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) { - if (*it != nullptr) { - buffers.emplace_back(**it); - owning_buffers.push_back(&*it); - } - } - - const int not_found = static_cast(buffers.size()); - - auto DoVisit = [&](auto get_buffer) { - DCHECK(!buffers.empty()); - - // owning_buffers[not_found] points to the null placeholder - owning_buffers.push_back(&kNullBuffer); - - std::string_view buffer_containing_previous_view = buffers.front(); - int buffer_i = 0; - - for (int64_t i = 0; i < data.length; ++i) { - if (headers[i].IsInline()) continue; - - if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, headers[i]))) { - // Fast path: for most string view arrays, we'll have runs - // of views into the same buffer. 
- } else { - buffer_i = get_buffer(headers[i]); - if (buffer_i != not_found) { - // if we didn't find a buffer which owns headers[i], we can hope - // that there was just one out of line string and check - // buffer_containing_previous_view next iteration - buffer_containing_previous_view = buffers[buffer_i]; - } - } - - RETURN_NOT_OK(visitor(i, *owning_buffers[buffer_i])); - } - return Status::OK(); - }; - - // Simplest check for view-in-buffer: loop through buffers and check each one. - auto Linear = [&](StringHeader view) { - int i = 0; - for (std::string_view buffer : buffers) { - if (IsSubrangeOf(buffer, view)) return i; - ++i; - } - return not_found; - }; - - if (buffers.size() <= 32) { - // If there are few buffers to search through, sorting/binary search is not - // worthwhile. TODO(bkietz) benchmark this and get a less magic number here. - return DoVisit(Linear); - } - - auto DataPtrLess = [](std::string_view l, std::string_view r) { - return l.data() < r.data(); - }; - - { - auto sort_indices = ArgSort(buffers, DataPtrLess); - Permute(sort_indices, &buffers); - Permute(sort_indices, &owning_buffers); - } - - bool non_overlapping = - buffers.end() != - std::adjacent_find(buffers.begin(), buffers.end(), - [](std::string_view before, std::string_view after) { - return before.data() + before.size() <= after.data(); - }); - if (ARROW_PREDICT_FALSE(!non_overlapping)) { - // Using a binary search with overlapping buffers would not *uniquely* identify - // a potentially-containing buffer. Moreover this should be a fairly rare case - // so optimizing for it seems premature. - return DoVisit(Linear); - } - - // More sophisticated check for view-in-buffer: binary search through the buffers. - return DoVisit([&](StringHeader view) { - // Find the first buffer whose data starts after the data in view- - // only buffers *before* this could contain view. 
Since we've additionally - // checked that the buffers do not overlap, only the buffer *immediately before* - // this could contain view. - int one_past_potential_super = - static_cast(std::upper_bound(buffers.begin(), buffers.end(), - std::string_view{view}, DataPtrLess) - - buffers.begin()); - - if (one_past_potential_super == 0) return not_found; - - int i = one_past_potential_super - 1; - if (IsSubrangeOf(buffers[i], view)) return i; - - return not_found; - }); -} - namespace { struct UTF8DataValidator { @@ -750,20 +624,49 @@ struct ValidateArrayImpl { " and offset: ", data.offset); } - if (!full_validation || BinaryViewArray::OptedOutOfViewValidation(data)) { + if (!full_validation) { return Status::OK(); } - return VisitNonInlinedViewsAndOwningBuffers( - data, [&](int64_t i, const std::shared_ptr& owner) { - if (ARROW_PREDICT_TRUE(owner != nullptr)) return Status::OK(); + if (type.has_raw_pointers()) { + // TODO(bkietz) validate as with conversions? + return Status::OK(); + } - auto* ptr = data.buffers[1]->data_as()[i].data(); - return Status::Invalid( - "String view at slot ", i, " @", - arrow::HexEncode(reinterpret_cast(&ptr), sizeof(ptr)), - " views memory not resident in any buffer managed by the array"); - }); + auto* s = data.GetValues(1); + for (int64_t i = 0; i < data.length; ++i, ++s) { + if (data.IsNull(i)) continue; + + if (s->IsInline()) continue; + + size_t buffer_index = s->GetBufferIndex(); + if (ARROW_PREDICT_FALSE(buffer_index + 2 >= data.buffers.size())) { + return Status::IndexError("String view at slot ", i, " references buffer ", + buffer_index, " but there are only ", + data.buffers.size() - 2, " character buffers"); + } + + size_t begin = s->GetBufferOffset(); + size_t end = begin + s->size(); + const auto& buffer = data.buffers[buffer_index + 2]; + auto size = static_cast(buffer->size()); + if (ARROW_PREDICT_FALSE(end > size)) { + return Status::IndexError("String view at slot ", i, " references range ", begin, + "-", end, " of buffer 
", buffer_index, + " but that buffer is only ", size, " bytes long"); + } + + const char* data = buffer->data_as() + begin; + if (ARROW_PREDICT_FALSE( + std::memcmp(data, s->GetInlineData(), StringHeader::kPrefixSize) != 0)) { + return Status::Invalid("String view at slot ", i, " has inlined prefix 0x", + HexEncode(s->GetInlineData(), StringHeader::kPrefixSize), + " but the out-of-line character data begins with 0x", + HexEncode(data, StringHeader::kPrefixSize)); + } + } + + return Status::OK(); } template diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index caddbf9db55..a1bc398cca4 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -148,6 +148,8 @@ struct DictionaryBuilderCase { Status Visit(const StringType&) { return CreateFor(); } Status Visit(const LargeBinaryType&) { return CreateFor(); } Status Visit(const LargeStringType&) { return CreateFor(); } + Status Visit(const BinaryViewType&) { return CreateFor(); } + Status Visit(const StringViewType&) { return CreateFor(); } Status Visit(const FixedSizeBinaryType&) { return CreateFor(); } Status Visit(const Decimal128Type&) { return CreateFor(); } Status Visit(const Decimal256Type&) { return CreateFor(); } @@ -162,6 +164,11 @@ struct DictionaryBuilderCase { template Status CreateFor() { + if constexpr (is_binary_view_like_type::value) { + if (checked_cast(*value_type).has_raw_pointers()) { + return NotImplemented(*value_type); + } + } using AdaptiveBuilderType = DictionaryBuilder; if (dictionary != nullptr) { out->reset(new AdaptiveBuilderType(dictionary, pool)); @@ -190,7 +197,12 @@ struct DictionaryBuilderCase { struct MakeBuilderImpl { template - enable_if_not_nested Visit(const T&) { + enable_if_not_nested Visit(const T& t) { + if constexpr (is_binary_view_like_type::value) { + if (t.has_raw_pointers()) { + return NotImplemented(); + } + } out.reset(new typename TypeTraits::BuilderType(type, pool)); return Status::OK(); } diff --git a/cpp/src/arrow/compare.cc 
b/cpp/src/arrow/compare.cc index dae1fc6f97a..c318a54b130 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -265,9 +265,28 @@ class RangeDataEqualsImpl { Status Visit(const BinaryViewType& type) { auto* left_values = left_.GetValues(1) + left_start_idx_; auto* right_values = right_.GetValues(1) + right_start_idx_; + if (type.has_raw_pointers()) { + VisitValidRuns([&](int64_t i, int64_t length) { + for (auto end_i = i + length; i < end_i; ++i) { + if (left_values[i] != right_values[i]) { + return false; + } + } + return true; + }); + return Status::OK(); + } + + auto* left_buffers = left_.buffers.data() + 2; + auto* right_buffers = right_.buffers.data() + 2; VisitValidRuns([&](int64_t i, int64_t length) { - return std::equal(left_values + i, left_values + i + length, - right_values + i, right_values + i + length); + for (auto end_i = i + length; i < end_i; ++i) { + if (!left_values[i].EqualsIndexOffset(left_buffers, right_values[i], + right_buffers)) { + return false; + } + } + return true; }); return Status::OK(); } @@ -636,13 +655,19 @@ class TypeEqualsVisitor { template enable_if_t::value || is_primitive_ctype::value || - is_base_binary_type::value || is_binary_view_like_type::value, + is_base_binary_type::value, Status> Visit(const T&) { result_ = true; return Status::OK(); } + Status Visit(const BinaryViewType& left) { + const auto& right = checked_cast(right_); + result_ = left.has_raw_pointers() == right.has_raw_pointers(); + return Status::OK(); + } + template enable_if_interval Visit(const T& left) { const auto& right = checked_cast(right_); @@ -813,8 +838,7 @@ class ScalarEqualsVisitor { Status Visit(const DoubleScalar& left) { return CompareFloating(left); } template - enable_if_t::value, Status> - Visit(const T& left) { + enable_if_t::value, Status> Visit(const T& left) { const auto& right = checked_cast(right_); result_ = internal::SharedPtrEquals(left.value, right.value); return Status::OK(); diff --git 
a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 5c4f69bee32..e70ed5f38a9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -28,6 +28,7 @@ #include "arrow/util/cpu_info.h" #include "arrow/util/formatting.h" #include "arrow/util/int_util.h" +#include "arrow/util/span.h" #include "arrow/util/unreachable.h" #include "arrow/util/utf8_internal.h" #include "arrow/visit_data_inline.h" @@ -38,8 +39,7 @@ using internal::StringFormatter; using util::InitializeUTF8; using util::ValidateUTF8Inline; -namespace compute { -namespace internal { +namespace compute::internal { namespace { @@ -55,7 +55,7 @@ struct NumericToStringCastFunctor { static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& input = batch[0].array; FormatterType formatter(input.type); - BuilderType builder(input.type->GetSharedPtr(), ctx->memory_pool()); + BuilderType builder(TypeTraits::type_singleton(), ctx->memory_pool()); RETURN_NOT_OK(VisitArraySpanInline( input, [&](value_type v) { @@ -79,7 +79,7 @@ struct DecimalToStringCastFunctor { static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& input = batch[0].array; FormatterType formatter(input.type); - BuilderType builder(input.type->GetSharedPtr(), ctx->memory_pool()); + BuilderType builder(TypeTraits::type_singleton(), ctx->memory_pool()); RETURN_NOT_OK(VisitArraySpanInline( input, [&](std::string_view bytes) { @@ -107,7 +107,7 @@ struct TemporalToStringCastFunctor { static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& input = batch[0].array; FormatterType formatter(input.type); - BuilderType builder(input.type->GetSharedPtr(), ctx->memory_pool()); + BuilderType builder(TypeTraits::type_singleton(), ctx->memory_pool()); RETURN_NOT_OK(VisitArraySpanInline( input, [&](value_type v) { @@ 
-132,7 +132,7 @@ struct TemporalToStringCastFunctor { const ArraySpan& input = batch[0].array; const auto& timezone = GetInputTimezone(*input.type); const auto& ty = checked_cast(*input.type); - BuilderType builder(input.type->GetSharedPtr(), ctx->memory_pool()); + BuilderType builder(TypeTraits::type_singleton(), ctx->memory_pool()); // Preallocate int64_t string_length = 19; // YYYY-MM-DD HH:MM:SS @@ -337,7 +337,6 @@ Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, } if constexpr (kInputViews && kOutputOffsets) { - // copy the input's null bitmap if necessary if (input.MayHaveNulls()) { ARROW_ASSIGN_OR_RAISE( output->buffers[0], @@ -349,76 +348,61 @@ Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, using offset_type = typename O::offset_type; - auto* offsets = output->buffers[1]->mutable_data_as(); - offsets[0] = 0; - auto AppendOffset = [&](size_t size) mutable { - offsets[1] = offsets[0] + static_cast(size); - ++offsets; - }; + auto* offset = output->buffers[1]->mutable_data_as(); + offset[0] = 0; - // TODO(bkietz) if ArraySpan::buffers were a SmallVector, we could have access to all - // the character data buffers here and reserve character data accordingly. 
+ util::span char_buffers{ + reinterpret_cast*>(input.buffers[2].data), + static_cast(input.buffers[2].size / sizeof(std::shared_ptr))}; + int64_t char_count = 0; + for (const auto& buf : char_buffers) { + char_count += buf->size(); + } BufferBuilder char_builder{ctx->memory_pool()}; + RETURN_NOT_OK(char_builder.Reserve(char_count)); - // sweep through L1-sized chunks to reduce the frequency of allocation and overflow - // checking - int64_t chunk_size = ctx->exec_context()->cpu_info()->CacheSize( - ::arrow::internal::CpuInfo::CacheLevel::L1) / - sizeof(StringHeader) / 4; - - RETURN_NOT_OK(::arrow::internal::VisitSlices( - input, chunk_size, [&](const ArraySpan& input_slice) { - size_t num_appended_chars = 0; - int64_t num_chars = char_builder.length(); - VisitArraySpanInline( - input_slice, [&](std::string_view v) { num_appended_chars += v.size(); }, - [] {}); - + RETURN_NOT_OK(VisitArraySpanInline( + input, + [&](std::string_view v) { if constexpr (std::is_same_v) { - if (ARROW_PREDICT_FALSE(char_builder.length() + num_appended_chars > + if (ARROW_PREDICT_FALSE(char_builder.length() + v.size() > std::numeric_limits::max())) { return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", out->type()->ToString(), ": input array viewed too many characters"); } } - RETURN_NOT_OK(char_builder.Reserve(static_cast(num_appended_chars))); - - VisitArraySpanInline( - input_slice, - [&](std::string_view v) { - char_builder.UnsafeAppend(v); - AppendOffset(v.size()); - }, - [&] { AppendOffset(0); }); - - if (check_utf8) { - if (ARROW_PREDICT_FALSE( - !ValidateUTF8Inline(char_builder.data() + num_chars, - static_cast(num_appended_chars)))) { - return Status::Invalid("Invalid UTF8 sequence"); - } - } + offset[1] = static_cast(offset[0] + v.size()); + ++offset; + return char_builder.Append(v); + }, + [&] { + offset[1] = offset[0]; + ++offset; return Status::OK(); })); + RETURN_NOT_OK(SimpleUtf8Validation()); return char_builder.Finish(&output->buffers[2]); } if 
constexpr ((kInputOffsets || kInputFixed) && kOutputViews) { - // FIXME(bkietz) when outputting views, we *could* output into slices, + // TODO(bkietz) when outputting views, we *could* output into slices, // provided we have a threadsafe place to stash accumulated buffers // of character data. - if (input.MayHaveNulls()) { ARROW_ASSIGN_OR_RAISE( output->buffers[0], arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, input.offset, input.length)); + } else { + output->buffers[0] = nullptr; } - // FIXME(bkietz) segfault due to null buffer owner - // output->buffers[2] = input.GetBuffer(kInputFixed ? 1 : 2); + // Borrow the input's character buffer + output->buffers.resize(3); + output->buffers[2] = input.GetBuffer(kInputFixed ? 1 : 2); + auto* buffer_data = output->buffers[2]->data_as(); auto* headers = output->buffers[1]->mutable_data_as(); if (check_utf8) { @@ -426,7 +410,8 @@ Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, return VisitArraySpanInline( input, [&](std::string_view v) { - *headers++ = StringHeader{v}; + new (headers++) + StringHeader{v.data(), static_cast(v.size()), 0, buffer_data}; return validator.VisitValue(v); }, [&] { @@ -435,7 +420,11 @@ Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, }); } else { VisitArraySpanInline( - input, [&](std::string_view v) { *headers++ = StringHeader{v}; }, + input, + [&](std::string_view v) { + new (headers++) + StringHeader{v.data(), static_cast(v.size()), 0, buffer_data}; + }, [&] { *headers++ = StringHeader{}; }); return Status::OK(); } @@ -627,6 +616,5 @@ std::vector> GetBinaryLikeCasts() { }; } -} // namespace internal -} // namespace compute +} // namespace compute::internal } // namespace arrow diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index e1d4282cb26..0b6ae4f6206 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -264,7 +264,8 @@ TEST_P(TestFeather, 
TimeTypes) { TEST_P(TestFeather, VLenPrimitiveRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckRoundtrip(batch); } @@ -306,7 +307,8 @@ TEST_P(TestFeather, SliceFloatRoundTrip) { TEST_P(TestFeather, SliceStringsRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckSlices(batch); } diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 6eee5955242..11d296f24f5 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -327,6 +327,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestString, TestStrings, StringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestBinary, TestStrings, BinaryType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeString, TestStrings, LargeStringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeBinary, TestStrings, LargeBinaryType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestStringView, TestStrings, StringViewType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestBinaryView, TestStrings, BinaryViewType); TEST(TestNull, Basics) { std::shared_ptr type = null(); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 46b163b829f..23accc2390f 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -258,6 +258,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeBinary: *out = large_binary(); return Status::OK(); + case flatbuf::Type::BinaryView: + *out = binary_view(); + return Status::OK(); case flatbuf::Type::FixedSizeBinary: { auto fw_binary = static_cast(type_data); return FixedSizeBinaryType::Make(fw_binary->byteWidth()).Value(out); @@ 
-268,6 +271,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeUtf8: *out = large_utf8(); return Status::OK(); + case flatbuf::Type::Utf8View: + *out = utf8_view(); + return Status::OK(); case flatbuf::Type::Bool: *out = boolean(); return Status::OK(); @@ -534,16 +540,24 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + static Status CheckForRawPointers(const BinaryViewType& type) { + if (type.has_raw_pointers()) { + return Status::NotImplemented( + type.ToString(), " cannot be serialized; convert to index/offset format first"); + } + return Status::OK(); + } + Status Visit(const BinaryViewType& type) { - // BinaryView will be written to IPC as a normal binary array - extra_type_metadata_[std::string{kSerializedStringViewKeyName}] = ""; - return Visit(BinaryType()); + fb_type_ = flatbuf::Type::BinaryView; + type_offset_ = flatbuf::CreateBinaryView(fbb_).Union(); + return CheckForRawPointers(type); } Status Visit(const StringViewType& type) { - // StringView will be written to IPC as a normal UTF8 string array - extra_type_metadata_[std::string{kSerializedStringViewKeyName}] = ""; - return Visit(StringType()); + fb_type_ = flatbuf::Type::Utf8View; + type_offset_ = flatbuf::CreateUtf8View(fbb_).Union(); + return CheckForRawPointers(type); } Status Visit(const LargeBinaryType& type) { @@ -853,12 +867,6 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, FieldPosition field_pos, } // NOTE: if extension type is unknown, we do not raise here and // simply return the storage type. - } else if (name_index = metadata->FindKey(std::string{kSerializedStringViewKeyName}); - name_index != -1) { - DCHECK(type->id() == Type::STRING || type->id() == Type::BINARY); - RETURN_NOT_OK(metadata->Delete(name_index)); - bool is_utf8 = type->id() == Type::STRING; - type = is_utf8 ? 
utf8_view() : binary_view(); } } @@ -985,6 +993,7 @@ static Status GetBodyCompression(FBB& fbb, const IpcWriteOptions& options, static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, const std::vector& nodes, const std::vector& buffers, + const std::vector& variadic_counts, const IpcWriteOptions& options, RecordBatchOffset* offset) { FieldNodeVector fb_nodes; RETURN_NOT_OK(WriteFieldNodes(fbb, nodes, &fb_nodes)); @@ -995,7 +1004,10 @@ static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, BodyCompressionOffset fb_compression; RETURN_NOT_OK(GetBodyCompression(fbb, options, &fb_compression)); - *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression); + auto fb_variadic_counts = fbb.CreateVector(variadic_counts); + + *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression, + fb_variadic_counts); return Status::OK(); } @@ -1242,11 +1254,12 @@ Status WriteRecordBatchMessage( int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, variadic_counts, + options, &record_batch)); return WriteFBMessage(fbb, flatbuf::MessageHeader::RecordBatch, record_batch.Union(), body_length, options.metadata_version, custom_metadata, options.memory_pool) @@ -1303,11 +1316,12 @@ Status WriteDictionaryMessage( int64_t id, bool is_delta, int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& 
variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, variadic_counts, + options, &record_batch)); auto dictionary_batch = flatbuf::CreateDictionaryBatch(fbb, id, record_batch, is_delta).Union(); return WriteFBMessage(fbb, flatbuf::MessageHeader::DictionaryBatch, dictionary_batch, diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index 6f07a8aea4f..631a336f75a 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -201,7 +201,8 @@ Status WriteRecordBatchMessage( const int64_t length, const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); ARROW_EXPORT Result> WriteTensorMessage(const Tensor& tensor, @@ -225,7 +226,8 @@ Status WriteDictionaryMessage( const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); static inline Result> WriteFlatbufferBuilder( flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference @@ -245,8 +247,6 @@ flatbuf::TimeUnit ToFlatbufferUnit(TimeUnit::type unit); ARROW_EXPORT TimeUnit::type FromFlatbufferUnit(flatbuf::TimeUnit unit); -constexpr std::string_view kSerializedStringViewKeyName = "ARROW:string_view"; - } // namespace internal } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 8d4db5ea0e7..9fb37d4b9f3 
100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -159,7 +159,7 @@ TEST_P(TestMessage, SerializeCustomMetadata) { ASSERT_OK(internal::WriteRecordBatchMessage( /*length=*/0, /*body_length=*/0, metadata, /*nodes=*/{}, - /*buffers=*/{}, options_, &serialized)); + /*buffers=*/{}, /*variadic_counts=*/{}, options_, &serialized)); ASSERT_OK_AND_ASSIGN(std::unique_ptr message, Message::Open(serialized, /*body=*/nullptr)); @@ -2873,21 +2873,21 @@ void GetReadRecordBatchReadRanges( // 1) read magic and footer length IO // 2) read footer IO // 3) read record batch metadata IO - ASSERT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); + EXPECT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); const int32_t magic_size = static_cast(strlen(ipc::internal::kArrowMagicBytes)); // read magic and footer length IO auto file_end_size = magic_size + sizeof(int32_t); auto footer_length_offset = buffer->size() - file_end_size; auto footer_length = bit_util::FromLittleEndian( util::SafeLoadAs(buffer->data() + footer_length_offset)); - ASSERT_EQ(read_ranges[0].length, file_end_size); + EXPECT_EQ(read_ranges[0].length, file_end_size); // read footer IO - ASSERT_EQ(read_ranges[1].length, footer_length); + EXPECT_EQ(read_ranges[1].length, footer_length); // read record batch metadata. The exact size is tricky to determine but it doesn't // matter for this test and it should be smaller than the footer. 
- ASSERT_LT(read_ranges[2].length, footer_length); + EXPECT_LE(read_ranges[2].length, footer_length); for (uint32_t i = 0; i < expected_body_read_lengths.size(); i++) { - ASSERT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); + EXPECT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); } } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index aa57d6c70de..efe81ce573e 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -244,6 +244,15 @@ class ArrayLoader { } } + Result GetVariadicCount(int i) { + auto* variadic_counts = metadata_->variadicCounts(); + CHECK_FLATBUFFERS_NOT_NULL(variadic_counts, "RecordBatch.variadicCounts"); + if (i >= static_cast(variadic_counts->size())) { + return Status::IOError("variadic_count_index out of range."); + } + return static_cast(variadic_counts->Get(i)); + } + Status GetFieldMetadata(int field_index, ArrayData* out) { auto nodes = metadata_->nodes(); CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); @@ -349,9 +358,18 @@ class ArrayLoader { } Status Visit(const BinaryViewType& type) { - // View arrays are serialized as the corresponding dense array. - // We can't produce the view array yet; the buffers may still be compressed. - return LoadBinary(type.id() == Type::STRING_VIEW ? 
Type::STRING : Type::BINARY); + out_->buffers.resize(2); + + RETURN_NOT_OK(LoadCommon(type.id())); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); + + ARROW_ASSIGN_OR_RAISE(auto character_buffer_count, + GetVariadicCount(variadic_count_index_++)); + out_->buffers.resize(character_buffer_count + 2); + for (size_t i = 0; i < character_buffer_count; ++i) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2])); + } + return Status::OK(); } Status Visit(const FixedSizeBinaryType& type) { @@ -446,6 +464,7 @@ class ArrayLoader { int buffer_index_ = 0; int field_index_ = 0; bool skip_io_ = false; + int variadic_count_index_ = 0; BatchDataReadRequest read_request_; const Field* field_ = nullptr; @@ -525,49 +544,6 @@ Status DecompressBuffers(Compression::type compression, const IpcReadOptions& op }); } -Status ConvertViewArrays(const IpcReadOptions& options, ArrayDataVector* fields) { - struct StringViewAccumulator { - using DataPtrVector = std::vector; - - void AppendFrom(const ArrayDataVector& fields) { - for (const auto& field : fields) { - if (field->type->id() == Type::STRING_VIEW || - field->type->id() == Type::BINARY_VIEW) { - view_arrays_.push_back(field.get()); - } - AppendFrom(field->child_data); - } - } - - DataPtrVector Get(const ArrayDataVector& fields) && { - AppendFrom(fields); - return std::move(view_arrays_); - } - - DataPtrVector view_arrays_; - }; - - auto view_arrays = StringViewAccumulator{}.Get(*fields); - - return ::arrow::internal::OptionalParallelFor( - options.use_threads, static_cast(view_arrays.size()), [&](int i) { - ArrayData* data = view_arrays[i]; - - // the only thing we need to fix here is replacing offsets with headers - ARROW_ASSIGN_OR_RAISE( - auto header_buffer, - AllocateBuffer(data->length * sizeof(StringHeader), options.memory_pool)); - - auto* headers = header_buffer->mutable_data_as(); - VisitArraySpanInline( - *data, [&](std::string_view v) { *headers++ = StringHeader{v}; }, - [&] { *headers++ = 
StringHeader{}; }); - - data->buffers[1] = std::move(header_buffer); - return Status::OK(); - }); -} - Result> LoadRecordBatchSubset( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, const std::vector* inclusion_mask, const IpcReadContext& context, @@ -616,13 +592,12 @@ Result> LoadRecordBatchSubset( RETURN_NOT_OK( DecompressBuffers(context.compression, context.options, &filtered_columns)); } - RETURN_NOT_OK(ConvertViewArrays(context.options, &filtered_columns)); // swap endian in a set of ArrayData if necessary (swap_endian == true) if (context.swap_endian) { - for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { - ARROW_ASSIGN_OR_RAISE(filtered_columns[i], - arrow::internal::SwapEndianArrayData(filtered_columns[i])); + for (auto& filtered_column : filtered_columns) { + ARROW_ASSIGN_OR_RAISE(filtered_column, + arrow::internal::SwapEndianArrayData(filtered_column)); } } return RecordBatch::Make(std::move(filtered_schema), metadata->length(), @@ -866,11 +841,10 @@ Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context, const Field dummy_field("", value_type); RETURN_NOT_OK(loader.Load(&dummy_field, dict_data.get())); - ArrayDataVector dict_fields{dict_data}; if (compression != Compression::UNCOMPRESSED) { + ArrayDataVector dict_fields{dict_data}; RETURN_NOT_OK(DecompressBuffers(compression, context.options, &dict_fields)); } - RETURN_NOT_OK(ConvertViewArrays(context.options, &dict_fields)); // swap endian in dict_data if necessary (swap_endian == true) if (context.swap_endian) { diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 37fb3ed3f84..6faaf96b332 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -351,26 +351,29 @@ static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls return builder.Finish(out); } -Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls) { +Status 
MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls, + bool with_view_types) { const int64_t length = 500; ArrayVector arrays; FieldVector fields; - using namespace std::string_literals; - for (auto MakeArray : { - &MakeBinaryArrayWithUniqueValues, - &MakeBinaryArrayWithUniqueValues, - &MakeBinaryArrayWithUniqueValues, - &MakeBinaryArrayWithUniqueValues, - &MakeBinaryArrayWithUniqueValues, - &MakeBinaryArrayWithUniqueValues, - }) { + auto AppendColumn = [&](auto& MakeArray) { arrays.emplace_back(); RETURN_NOT_OK(MakeArray(length, with_nulls, default_memory_pool(), &arrays.back())); const auto& type = arrays.back()->type(); fields.push_back(field(type->ToString(), type)); + return Status::OK(); + }; + + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + if (with_view_types) { + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); } *out = RecordBatch::Make(schema(std::move(fields)), length, std::move(arrays)); diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index 5e0c65556c6..2d3447eceb0 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -96,7 +96,8 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatch(std::shared_ptr* out, - bool with_nulls = true); + bool with_nulls = true, + bool with_view_types = true); ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index e911abffa34..70d66ebfaaf 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -176,7 +176,8 @@ class RecordBatchSerializer { // Override this 
for writing dictionary metadata virtual Status SerializeMetadata(int64_t num_rows) { return WriteRecordBatchMessage(num_rows, out_->body_length, custom_metadata_, - field_nodes_, buffer_meta_, options_, &out_->metadata); + field_nodes_, buffer_meta_, variadic_counts_, options_, + &out_->metadata); } bool ShouldCompress(int64_t uncompressed_size, int64_t compressed_size) const { @@ -293,6 +294,8 @@ class RecordBatchSerializer { offset += size + padding; } + variadic_counts_ = out_->variadic_counts; + out_->body_length = offset - buffer_start_offset_; DCHECK(bit_util::IsMultipleOf8(out_->body_length)); @@ -401,49 +404,14 @@ class RecordBatchSerializer { } Status Visit(const BinaryViewArray& array) { - // a separate helper doesn't make sense here since we've already done the work - // to copy the bitmap - out_->body_buffers.emplace_back(); - ARROW_ASSIGN_OR_RAISE( - out_->body_buffers.back(), - AllocateBuffer(sizeof(int32_t) * (array.length() + 1), options_.memory_pool)); - - auto* offsets = out_->body_buffers.back()->mutable_data_as(); - offsets[0] = 0; - auto AppendOffset = [&](size_t size) mutable { - // ignore overflow for now - offsets[1] = arrow::internal::SafeSignedAdd(offsets[0], static_cast(size)); - ++offsets; - }; + auto headers = SliceBuffer(array.values(), array.offset() * sizeof(StringHeader), + array.length() * sizeof(StringHeader)); + out_->body_buffers.emplace_back(std::move(headers)); - int64_t size = 0; - VisitArraySpanInline( - *array.data(), - [&](std::string_view v) { - size += static_cast(v.size()); - AppendOffset(v.size()); - }, - [&] { AppendOffset(0); }); - - if (size > std::numeric_limits::max()) { - return Status::Invalid( - "Input view array viewed more characters than are representable with 32 bit " - "offsets, unable to serialize"); + out_->variadic_counts.emplace_back(array.data()->buffers.size() - 2); + for (size_t i = 2; i < array.data()->buffers.size(); ++i) { + out_->body_buffers.emplace_back(array.data()->buffers[i]); } - - 
out_->body_buffers.emplace_back(); - ARROW_ASSIGN_OR_RAISE(out_->body_buffers.back(), - AllocateBuffer(size, options_.memory_pool)); - - VisitArraySpanInline( - *array.data(), - [chars = out_->body_buffers.back()->mutable_data_as()]( - std::string_view v) mutable { - v.copy(chars, v.size()); - chars += v.size(); - }, - [] {}); - return Status::OK(); } @@ -634,6 +602,7 @@ class RecordBatchSerializer { std::vector field_nodes_; std::vector buffer_meta_; + std::vector variadic_counts_; const IpcWriteOptions& options_; int64_t max_recursion_depth_; @@ -650,8 +619,8 @@ class DictionarySerializer : public RecordBatchSerializer { Status SerializeMetadata(int64_t num_rows) override { return WriteDictionaryMessage(dictionary_id_, is_delta_, num_rows, out_->body_length, - custom_metadata_, field_nodes_, buffer_meta_, options_, - &out_->metadata); + custom_metadata_, field_nodes_, buffer_meta_, + variadic_counts_, options_, &out_->metadata); } Status Assemble(const std::shared_ptr& dictionary) { diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 9e18a213ba3..0b62c011d88 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -57,6 +57,7 @@ struct IpcPayload { MessageType type = MessageType::NONE; std::shared_ptr metadata; std::vector> body_buffers; + std::vector variadic_counts; int64_t body_length = 0; // serialized body length (padded, maybe compressed) int64_t raw_body_length = 0; // initial uncompressed body length }; diff --git a/cpp/src/arrow/testing/json_internal.cc b/cpp/src/arrow/testing/json_internal.cc index 45db2346d28..304be7ffec8 100644 --- a/cpp/src/arrow/testing/json_internal.cc +++ b/cpp/src/arrow/testing/json_internal.cc @@ -46,6 +46,8 @@ #include "arrow/util/formatting.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" +#include "arrow/util/range.h" +#include "arrow/util/span.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" #include "arrow/visit_array_inline.h" @@ 
-105,6 +107,11 @@ std::string GetTimeUnitName(TimeUnit::type unit) { return "UNKNOWN"; } +std::string_view GetStringView(const rj::Value& str) { + DCHECK(str.IsString()); + return {str.GetString(), str.GetStringLength()}; +} + class SchemaWriter { public: explicit SchemaWriter(const Schema& schema, const DictionaryFieldMapper& mapper, @@ -387,8 +394,8 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } - Status Visit(const StringViewType& type) { return WritePrimitive("utf8_view", type); } - Status Visit(const BinaryViewType& type) { return WritePrimitive("binary_view", type); } + Status Visit(const StringViewType& type) { return WritePrimitive("utf8view", type); } + Status Visit(const BinaryViewType& type) { return WritePrimitive("binaryview", type); } Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); } Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); } Status Visit(const FixedSizeBinaryType& type) { @@ -535,22 +542,19 @@ class ArrayWriter { } } - // Binary, encode to hexadecimal. 
- template - enable_if_binary_like WriteDataValues( - const ArrayType& arr) { - for (int64_t i = 0; i < arr.length(); ++i) { - writer_->String(HexEncode(arr.GetView(i))); - } - } - - // UTF8 string, write as is - template - enable_if_string_like WriteDataValues( - const ArrayType& arr) { + template + std::enable_if_t::value || + is_fixed_size_binary_type::value> + WriteDataValues(const ArrayType& arr) { for (int64_t i = 0; i < arr.length(); ++i) { - auto view = arr.GetView(i); - writer_->String(view.data(), static_cast(view.size())); + if constexpr (Type::is_utf8) { + // UTF8 string, write as is + auto view = arr.GetView(i); + writer_->String(view.data(), static_cast(view.size())); + } else { + // Binary, encode to hexadecimal. + writer_->String(HexEncode(arr.GetView(i))); + } } } @@ -649,6 +653,52 @@ class ArrayWriter { writer_->EndArray(); } + template + void WriteStringHeaderField(const ArrayType& array) { + writer_->Key(kData); + writer_->StartArray(); + for (int64_t i = 0; i < array.length(); ++i) { + auto s = array.raw_values()[i]; + writer_->StartObject(); + writer_->Key("SIZE"); + writer_->Int64(s.size()); + if (s.IsInline()) { + writer_->Key("INLINED"); + if constexpr (IsUtf8) { + writer_->String(s.GetInlineData(), StringHeader::kInlineSize); + } else { + writer_->String(HexEncode(s.GetInlineData(), StringHeader::kInlineSize)); + } + } else { + writer_->Key("PREFIX"); + if constexpr (IsUtf8) { + writer_->String(s.GetPrefix().data(), StringHeader::kPrefixSize); + } else { + writer_->String(HexEncode(s.GetPrefix().data(), StringHeader::kPrefixSize)); + } + writer_->Key("BUFFER_INDEX"); + writer_->Int64(s.GetBufferIndex()); + writer_->Key("OFFSET"); + writer_->Int64(s.GetBufferOffset()); + } + writer_->EndObject(); + } + writer_->EndArray(); + } + + void WriteVariadicBuffersField(const BinaryViewArray& arr) { + writer_->Key("VARIADIC_BUFFERS"); + writer_->StartArray(); + const auto& buffers = arr.data()->buffers; + for (size_t i = 2; i < buffers.size(); ++i) 
{ + // Encode the character buffers into hexadecimal strings. + // Even for arrays which contain utf-8, portions of the buffer not + // referenced by any view may be invalid. + writer_->String(buffers[i]->ToHexString()); + } + writer_->EndArray(); + } + void WriteValidityField(const Array& arr) { writer_->Key("VALIDITY"); writer_->StartArray(); @@ -689,8 +739,10 @@ class ArrayWriter { } template - enable_if_t::value, Status> Visit( - const ArrayType& array) { + enable_if_t::value && + !is_binary_view_like_type::value, + Status> + Visit(const ArrayType& array) { WriteValidityField(array); WriteDataField(array); SetNoChildren(); @@ -707,6 +759,21 @@ class ArrayWriter { return Status::OK(); } + template + enable_if_binary_view_like Visit( + const ArrayType& array) { + if (array.has_raw_pointers()) { + return Status::NotImplemented("serialization of ", array.type()->ToString()); + } + + WriteValidityField(array); + WriteStringHeaderField(array); + WriteVariadicBuffersField(array); + + SetNoChildren(); + return Status::OK(); + } + Status Visit(const DictionaryArray& array) { return VisitArrayValues(*array.indices()); } @@ -1068,6 +1135,10 @@ Status GetType(const RjObject& json_type, *type = utf8(); } else if (type_name == "binary") { *type = binary(); + } else if (type_name == "utf8view") { + *type = utf8_view(); + } else if (type_name == "binaryview") { + *type = binary_view(); } else if (type_name == "largeutf8") { *type = large_utf8(); } else if (type_name == "largebinary") { @@ -1344,7 +1415,7 @@ class ArrayReader { int64_t offset_end = ParseOffset(json_offsets[i + 1]); DCHECK(offset_end >= offset_start); - if (T::is_utf8) { + if constexpr (T::is_utf8) { auto str = val.GetString(); DCHECK(std::string(str).size() == static_cast(offset_end - offset_start)); RETURN_NOT_OK(builder.Append(str)); @@ -1370,8 +1441,97 @@ class ArrayReader { return FinishBuilder(&builder); } - Status Visit(const BinaryViewType& type) { - return Status::NotImplemented("Binary / string view"); 
+ template + enable_if_binary_view_like Visit(const ViewType& type) { + ARROW_ASSIGN_OR_RAISE(const auto json_views, GetDataArray(obj_)); + ARROW_ASSIGN_OR_RAISE(const auto json_variadic_bufs, + GetMemberArray(obj_, "VARIADIC_BUFFERS")); + + using internal::Zip; + using util::span; + + BufferVector buffers; + buffers.resize(json_variadic_bufs.Size() + 2); + for (auto [json_buf, buf] : Zip(json_variadic_bufs, span{buffers}.subspan(2))) { + auto hex_string = GetStringView(json_buf); + ARROW_ASSIGN_OR_RAISE( + buf, AllocateBuffer(static_cast(hex_string.size()) / 2, pool_)); + RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data())); + } + + TypedBufferBuilder validity_builder{pool_}; + RETURN_NOT_OK(validity_builder.Resize(length_)); + for (bool is_valid : is_valid_) { + validity_builder.UnsafeAppend(is_valid); + } + ARROW_ASSIGN_OR_RAISE(buffers[0], validity_builder.Finish()); + + ARROW_ASSIGN_OR_RAISE(buffers[1], + AllocateBuffer(length_ * sizeof(StringHeader), pool_)); + + span headers{buffers[1]->mutable_data_as(), + static_cast(length_)}; + + int64_t null_count = 0; + for (auto [json_view, header, is_valid] : Zip(json_views, headers, is_valid_)) { + if (!is_valid) { + header = {}; + ++null_count; + continue; + } + + DCHECK(json_view.IsObject()); + const auto& json_view_obj = json_view.GetObject(); + + auto json_size = json_view_obj.FindMember("SIZE"); + RETURN_NOT_INT("SIZE", json_size, json_view_obj); + auto size = static_cast(json_size->value.GetInt64()); + + if (StringHeader::IsInline(size)) { + auto json_inlined = json_view_obj.FindMember("INLINED"); + RETURN_NOT_STRING("INLINED", json_inlined, json_view_obj); + if constexpr (ViewType::is_utf8) { + DCHECK_EQ(json_inlined->value.GetStringLength(), StringHeader::kInlineSize); + header = StringHeader{json_inlined->value.GetString(), size}; + } else { + DCHECK_EQ(json_inlined->value.GetStringLength(), StringHeader::kInlineSize * 2); + std::array inlined; + 
RETURN_NOT_OK(ParseHexValues(GetStringView(json_inlined->value), + reinterpret_cast(inlined.data()))); + header = StringHeader{inlined.data(), size}; + } + continue; + } + + auto json_prefix = json_view_obj.FindMember("PREFIX"); + auto json_buffer_index = json_view_obj.FindMember("BUFFER_INDEX"); + auto json_offset = json_view_obj.FindMember("OFFSET"); + RETURN_NOT_STRING("PREFIX", json_prefix, json_view_obj); + RETURN_NOT_INT("BUFFER_INDEX", json_buffer_index, json_view_obj); + RETURN_NOT_INT("OFFSET", json_offset, json_view_obj); + + std::array prefix; + if constexpr (ViewType::is_utf8) { + DCHECK_EQ(json_prefix->value.GetStringLength(), StringHeader::kPrefixSize); + auto prefix_ptr = json_prefix->value.GetString(); + prefix = {prefix_ptr[0], prefix_ptr[1], prefix_ptr[2], prefix_ptr[3]}; + } else { + DCHECK_EQ(json_prefix->value.GetStringLength(), StringHeader::kPrefixSize * 2); + RETURN_NOT_OK(ParseHexValues(GetStringView(json_prefix->value), + reinterpret_cast(prefix.data()))); + } + + header = StringHeader{size, prefix, + static_cast(json_buffer_index->value.GetInt64()), + static_cast(json_offset->value.GetInt64())}; + + DCHECK_LE(header.GetBufferIndex(), buffers.size() - 2); + DCHECK_LE(static_cast(header.GetBufferOffset() + header.size()), + buffers[header.GetBufferIndex() + 2]->size()); + } + + data_ = ArrayData::Make(type_, length_, std::move(buffers), null_count); + return Status::OK(); } Status Visit(const DayTimeIntervalType& type) { diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 8aa3c781365..33b618b4a23 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -430,11 +430,11 @@ std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( std::shared_ptr RandomArrayGenerator::StringView(int64_t size, int32_t min_length, int32_t max_length, - double null_probability, + double null_probability, int64_t alignment, MemoryPool* memory_pool) { - return GenerateBinaryArray(this, size, min_length, 
max_length, - null_probability, alignment, memory_pool); + return GenerateBinaryArray( + this, size, min_length, max_length, null_probability, alignment, memory_pool); } std::shared_ptr RandomArrayGenerator::StringWithRepeats( @@ -856,8 +856,15 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t GetMetadata(field.metadata().get(), "min_length", 0); const auto max_length = GetMetadata(field.metadata().get(), "max_length", 20); - return *StringView(length, min_length, max_length, null_probability) - ->View(field.type()); + + auto out = StringView(length, min_length, max_length, null_probability, alignment); + + if (internal::checked_cast(*field.type()) + .has_raw_pointers()) { + ABORT_NOT_OK(internal::SwapStringHeaderPointers( + *out->data(), out->data()->buffers[1]->mutable_data_as())); + } + return out->View(field.type()).ValueOrDie(); } case Type::type::DECIMAL128: diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 2f6d6354b28..cff6cdd1354 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -160,6 +160,7 @@ auto values = ::testing::Values( field("uint32", uint32()), field("int32", int32()), field("uint64", uint64()), field("int64", int64()), field("float16", float16()), field("float32", float32()), field("float64", float64()), field("string", utf8()), field("binary", binary()), + field("string_view", utf8_view()), field("binary_view", binary_view()), field("fixed_size_binary", fixed_size_binary(8)), field("decimal128", decimal128(8, 3)), field("decimal128", decimal128(29, -5)), field("decimal256", decimal256(16, 4)), field("decimal256", decimal256(57, -6)), diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h index 4f4b03438fd..703593450e8 100644 --- a/cpp/src/arrow/testing/util.h +++ b/cpp/src/arrow/testing/util.h @@ -53,6 +53,14 @@ Status CopyBufferFromVector(const std::vector& values, MemoryPool* pool, return Status::OK(); } 
+template +Result> CopyBufferFromVector( + const std::vector& values, MemoryPool* pool = default_memory_pool()) { + std::shared_ptr out; + RETURN_NOT_OK(CopyBufferFromVector(values, pool, &out)); + return out; +} + // Sets approximately pct_null of the first n bytes in null_bytes to zero // and the rest to non-zero (true) values. ARROW_TESTING_EXPORT void random_null_bytes(int64_t n, double pct_null, diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 1c0a7544cb8..1c040bec63b 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -601,13 +601,17 @@ std::string FixedSizeListType::ToString() const { std::string BinaryType::ToString() const { return "binary"; } -std::string BinaryViewType::ToString() const { return "binary_view"; } +std::string BinaryViewType::ToString() const { + return raw_pointers_ ? "binary_view[RAW POINTERS]" : "binary_view"; +} std::string LargeBinaryType::ToString() const { return "large_binary"; } std::string StringType::ToString() const { return "string"; } -std::string StringViewType::ToString() const { return "string_view"; } +std::string StringViewType::ToString() const { + return raw_pointers_ ? "string_view[RAW POINTERS]" : "string_view"; +} std::string LargeStringType::ToString() const { return "large_string"; } @@ -2541,14 +2545,24 @@ TYPE_FACTORY(float16, HalfFloatType) TYPE_FACTORY(float32, FloatType) TYPE_FACTORY(float64, DoubleType) TYPE_FACTORY(utf8, StringType) -TYPE_FACTORY(utf8_view, StringViewType) TYPE_FACTORY(large_utf8, LargeStringType) TYPE_FACTORY(binary, BinaryType) -TYPE_FACTORY(binary_view, BinaryViewType) TYPE_FACTORY(large_binary, LargeBinaryType) TYPE_FACTORY(date64, Date64Type) TYPE_FACTORY(date32, Date32Type) +const std::shared_ptr& utf8_view(bool has_raw_pointers) { + static std::shared_ptr io = std::make_shared(); + static std::shared_ptr raw = std::make_shared(true); + return has_raw_pointers ? 
raw : io; +} + +const std::shared_ptr& binary_view(bool has_raw_pointers) { + static std::shared_ptr io = std::make_shared(); + static std::shared_ptr raw = std::make_shared(true); + return has_raw_pointers ? raw : io; +} + std::shared_ptr fixed_size_binary(int32_t byte_width) { return std::make_shared(byte_width); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index e9b171d9d88..30615de71c3 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -727,7 +728,8 @@ class ARROW_EXPORT BinaryViewType : public DataType { static constexpr const char* type_name() { return "binary_view"; } - BinaryViewType() : BinaryViewType(Type::BINARY_VIEW) {} + explicit BinaryViewType(bool has_raw_pointers = false) + : BinaryViewType(Type::BINARY_VIEW, has_raw_pointers) {} DataTypeLayout layout() const override { return DataTypeLayout( @@ -738,11 +740,16 @@ class ARROW_EXPORT BinaryViewType : public DataType { std::string ToString() const override; std::string name() const override { return "binary_view"; } + bool has_raw_pointers() const { return raw_pointers_; } + protected: std::string ComputeFingerprint() const override; // Allow subclasses like StringType to change the logical type. 
- explicit BinaryViewType(Type::type logical_type) : DataType(logical_type) {} + explicit BinaryViewType(Type::type logical_type, bool has_raw_pointers) + : DataType(logical_type), raw_pointers_(has_raw_pointers) {} + + bool raw_pointers_ = false; }; /// \brief Concrete type class for large variable-size binary data @@ -800,7 +807,7 @@ class ARROW_EXPORT StringViewType : public BinaryViewType { static constexpr const char* type_name() { return "utf8_view"; } - StringViewType() : BinaryViewType(Type::STRING_VIEW) {} + explicit StringViewType(bool has_raw_pointers = false) : BinaryViewType(Type::STRING_VIEW, has_raw_pointers) {} std::string ToString() const override; std::string name() const override { return "utf8_view"; } diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 64f837f84aa..c0569fc091f 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -472,13 +472,13 @@ ARROW_EXPORT const std::shared_ptr& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr& utf8(); /// \brief Return a StringViewType instance -ARROW_EXPORT const std::shared_ptr& utf8_view(); +ARROW_EXPORT const std::shared_ptr& utf8_view(bool has_raw_pointers = false); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr& large_utf8(); /// \brief Return a BinaryType instance ARROW_EXPORT const std::shared_ptr& binary(); /// \brief Return a BinaryViewType instance -ARROW_EXPORT const std::shared_ptr& binary_view(); +ARROW_EXPORT const std::shared_ptr& binary_view(bool has_raw_pointers = false); /// \brief Return a LargeBinaryType instance ARROW_EXPORT const std::shared_ptr& large_binary(); /// \brief Return a Date32Type instance diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 4d364655787..719cad42aa4 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -111,3 +111,4 @@ add_arrow_benchmark(thread_pool_benchmark) 
add_arrow_benchmark(trie_benchmark) add_arrow_benchmark(utf8_util_benchmark) add_arrow_benchmark(value_parsing_benchmark) +add_arrow_benchmark(string_conversion_benchmark) diff --git a/cpp/src/arrow/util/range.h b/cpp/src/arrow/util/range.h index ea0fb0eeaab..19af8f9c29f 100644 --- a/cpp/src/arrow/util/range.h +++ b/cpp/src/arrow/util/range.h @@ -23,9 +23,9 @@ #include #include #include +#include -namespace arrow { -namespace internal { +namespace arrow::internal { /// Create a vector containing the values from start up to stop template @@ -151,5 +151,55 @@ LazyRange MakeLazyRange(Generator&& gen, int64_t length) { return LazyRange(std::forward(gen), length); } -} // namespace internal -} // namespace arrow +/// \brief A helper for iterating multiple ranges simultaneously, modelled after python's +/// built-in zip() function. +/// +/// \code {.cpp} +/// const std::vector& tables = ... +/// std::function()> GetNames = ... +/// for (auto&& [table, name] : Zip(tables, GetNames())) { +/// static_assert(std::is_same_v); +/// static_assert(std::is_same_v); +/// // temporaries (like this vector of strings) are kept alive for the +/// // duration of a loop and are safely movable). +/// RegisterTableWithName(std::move(name), &table); +/// } +/// \endcode +template +struct Zip; + +template +Zip(Ranges&&...) -> Zip, std::index_sequence_for>; + +template +struct Zip, std::index_sequence> { + explicit Zip(Ranges... ranges) : ranges_(std::forward(ranges)...) {} + + std::tuple ranges_; + + using sentinel = std::tuple(ranges_).end())...>; + + struct iterator : std::tuple(ranges_).begin())...> { + using std::tuple(ranges_).begin())...>::tuple; + + constexpr auto operator*() { + return std::tuple(*this))...>{*std::get(*this)...}; + } + + constexpr iterator& operator++() { + (++std::get(*this), ...); + return *this; + } + + constexpr bool operator!=(const sentinel& s) const { + bool all_iterators_valid = (... 
&& (std::get(*this) != std::get(s))); + return all_iterators_valid; + } + }; + + constexpr iterator begin() { return {std::get(ranges_).begin()...}; } + + constexpr sentinel end() { return {std::get(ranges_).end()...}; } +}; + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/sort.h b/cpp/src/arrow/util/sort.h index cdffe0b2317..bf37bf50700 100644 --- a/cpp/src/arrow/util/sort.h +++ b/cpp/src/arrow/util/sort.h @@ -24,11 +24,12 @@ #include #include -namespace arrow { -namespace internal { +#include "arrow/util/span.h" + +namespace arrow::internal { template > -std::vector ArgSort(const std::vector& values, Cmp&& cmp = {}) { +std::vector ArgSort(arrow::util::span values, Cmp&& cmp = {}) { std::vector indices(values.size()); std::iota(indices.begin(), indices.end(), 0); std::sort(indices.begin(), indices.end(), @@ -36,6 +37,11 @@ std::vector ArgSort(const std::vector& values, Cmp&& cmp = {}) { return indices; } +template +std::vector ArgSort(const Range& values, Cmp&&... cmp) { + return ArgSort(arrow::util::span{values}, std::forward(cmp)...); +} + template size_t Permute(const std::vector& indices, std::vector* values) { if (indices.size() <= 1) { @@ -74,5 +80,4 @@ size_t Permute(const std::vector& indices, std::vector* values) { return cycle_count; } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/span.h b/cpp/src/arrow/util/span.h new file mode 100644 index 00000000000..cda0ea9548b --- /dev/null +++ b/cpp/src/arrow/util/span.h @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +namespace arrow::util { + +/// std::span polyfill +template +class span { + static_assert(sizeof(T), + R"( +std::span allows contiguous_iterators instead of just pointers, the enforcement +of which requires T to be a complete type. arrow::util::span does not support +contiguous_iterators, but T is still required to be a complete type to prevent +writing code which would break when it is replaced by std::span.)"); + + public: + using element_type = T; + + span() = default; + span(const span&) = default; + span& operator=(const span&) = default; + + template >> + // NOLINTNEXTLINE runtime/explicit + constexpr span(span mut) : span{mut.data(), mut.size()} {} + + constexpr span(T* data, std::size_t count) : data_{data}, size_{count} {} + + constexpr span(T* begin, T* end) + : data_{begin}, size_{static_cast(end - begin)} {} + + template < + typename R, + typename = std::enable_if_t().data()), T*>>, + typename = std::enable_if_t< + std::is_convertible_v().size()), std::size_t>>> + // NOLINTNEXTLINE runtime/explicit, non-const reference + constexpr span(R& range) : span{range.data(), range.size()} {} + + constexpr T* begin() const { return data_; } + constexpr T* end() const { return data_ + size_; } + constexpr T* data() const { return data_; } + + constexpr std::size_t size() const { return size_; } + constexpr std::size_t size_bytes() const { return size_ * sizeof(T); } + constexpr bool empty() const { return size_ == 0; } + + constexpr T& operator[](std::size_t i) { return data_[i]; } + constexpr 
const T& operator[](std::size_t i) const { return data_[i]; } + + constexpr span subspan(std::size_t offset) const { + if (offset > size_) return {}; + return {data_ + offset, size_ - offset}; + } + + constexpr span subspan(std::size_t offset, std::size_t count) const { + auto out = subspan(offset); + if (count < out.size_) { + out.size_ = count; + } + return out; + } + + constexpr bool operator==(span const& other) const { + if (size_ != other.size_) return false; + + T* ptr = data_; + for (T const& e : other) { + if (*ptr++ != e) return false; + } + return true; + } + constexpr bool operator!=(span const& other) const { + return !(*this == other); + } + + private: + T* data_{}; + std::size_t size_{}; +}; + +template +span(R&) -> span().data())>>; + +template +constexpr span as_bytes(span s) { + return {reinterpret_cast(s.data()), s.size_bytes()}; +} + +template +constexpr span as_writable_bytes(span s) { + return {reinterpret_cast(s.data()), s.size_bytes()}; +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/string.cc b/cpp/src/arrow/util/string.cc index 2055b4f47ea..07dfc2ce79f 100644 --- a/cpp/src/arrow/util/string.cc +++ b/cpp/src/arrow/util/string.cc @@ -90,6 +90,16 @@ Status ParseHexValue(const char* data, uint8_t* out) { return Status::OK(); } +Status ParseHexValues(std::string_view hex_string, uint8_t* out) { + if (hex_string.size() % 2 != 0) { + return Status::Invalid("Expected base16 hex string"); + } + for (size_t j = 0; j < hex_string.size() / 2; ++j) { + RETURN_NOT_OK(ParseHexValue(hex_string.data() + j * 2, out + j)); + } + return Status::OK(); +} + namespace internal { std::vector SplitString(std::string_view v, char delimiter, diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index d9777efc56a..d7e377773f6 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -46,7 +46,9 @@ ARROW_EXPORT std::string HexEncode(std::string_view str); ARROW_EXPORT std::string Escape(std::string_view str); 
-ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out); +ARROW_EXPORT Status ParseHexValue(const char* hex_pair, uint8_t* out); + +ARROW_EXPORT Status ParseHexValues(std::string_view hex_string, uint8_t* out); namespace internal { diff --git a/cpp/src/arrow/util/string_conversion_benchmark.cc b/cpp/src/arrow/util/string_conversion_benchmark.cc new file mode 100644 index 00000000000..2b50ea40545 --- /dev/null +++ b/cpp/src/arrow/util/string_conversion_benchmark.cc @@ -0,0 +1,250 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/unreachable.h" +#include "arrow/visit_data_inline.h" +#include "benchmark/benchmark.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array/array_binary.h" +#include "arrow/array/builder_binary.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/util/formatting.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/value_parsing.h" + +namespace arrow::internal { +namespace { + +// Matrix of benchmarks: +// +// Direction: +// - STRING <-> RAW VIEWS +// - STRING <-> IO VIEWS +// - IO VIEWS <-> RAW VIEWS +// +// View length: +// - pure inline +// - pure non-inline +// - mixed with small mean length +// - mixed with large mean length +// +// Character buffer count: +// - ensure there are multiple 1MB buffers +// - baseline with only a single character buffer? +constexpr int kCharacterCount = (1 << 20) * 16; + +// Null counts? +// Scrambled ordering? 
+ +enum { + kStrings, + kRawPointerViews, + kIndexOffsetViews, +}; +std::shared_ptr DataTypeFor(decltype(kStrings) enm) { + switch (enm) { + case kStrings: + return utf8(); + case kIndexOffsetViews: + return utf8_view(); + case kRawPointerViews: + return utf8_view(/*has_raw_pointers=*/true); + } + Unreachable(); +} + +enum { + kAlwaysInlineable, + kUsuallyInlineable, + kShortButNeverInlineable, + kLongAndSeldomInlineable, + kLongAndNeverInlineable, +}; + +StringViewArray ToStringViewArray(const StringArray& arr) { + auto header_buffer = AllocateBuffer(arr.length() * sizeof(StringHeader)).ValueOrDie(); + + StringHeadersFromStrings(*arr.data(), header_buffer->mutable_data_as()); + + return {arr.length(), + std::move(header_buffer), + {arr.value_data()}, + arr.null_bitmap(), + arr.null_count()}; +} + +StringArray ToStringArray(const StringViewArray& arr) { + int64_t char_count = 0; + for (size_t i = 2; i < arr.data()->buffers.size(); ++i) { + char_count += arr.data()->buffers[i]->size(); + } + + auto offset_buffer = AllocateBuffer((arr.length() + 1) * sizeof(int32_t)).ValueOrDie(); + auto* offset = offset_buffer->mutable_data_as(); + offset[0] = 0; + + BufferBuilder char_buffer_builder; + ABORT_NOT_OK(char_buffer_builder.Reserve(char_count)); + + ABORT_NOT_OK(VisitArraySpanInline( + *arr.data(), + [&](std::string_view v) { + offset[1] = offset[0] + v.size(); + ++offset; + return char_buffer_builder.Append(v); + }, + [&] { + offset[1] = offset[0]; + ++offset; + return Status::OK(); + })); + + auto char_buffer = char_buffer_builder.Finish().ValueOrDie(); + + return {arr.length(), std::move(offset_buffer), std::move(char_buffer), + arr.null_bitmap(), arr.null_count()}; +} + +std::shared_ptr ToRawPointers(const StringViewArray& io) { + auto raw_buf = AllocateBuffer(io.length() * sizeof(StringHeader)).ValueOrDie(); + auto st = + SwapStringHeaderPointers(*io.data(), raw_buf->mutable_data_as()); + ABORT_NOT_OK(st); + return raw_buf; +} + +std::shared_ptr ToIndexOffsets(const 
StringViewArray& raw) { + auto io_buf = AllocateBuffer(raw.length() * sizeof(StringHeader)).ValueOrDie(); + auto st = + SwapStringHeaderPointers(*raw.data(), io_buf->mutable_data_as()); + ABORT_NOT_OK(st); + return io_buf; +} + +template +static void ConvertViews(benchmark::State& state) { // NOLINT non-const reference + auto [min_length, max_length] = [] { + switch (StringLengthsAre) { + case kAlwaysInlineable: + return std::pair{0, 12}; + case kUsuallyInlineable: + return std::pair{0, 16}; + case kShortButNeverInlineable: + return std::pair{13, 30}; + case kLongAndSeldomInlineable: + return std::pair{0, 256}; + case kLongAndNeverInlineable: + return std::pair{13, 256}; + } + }(); + + auto num_items = kCharacterCount / max_length; + + auto from_type = DataTypeFor(From); + auto to_type = DataTypeFor(To); + + auto from = random::GenerateArray(*field("", from_type, + key_value_metadata({ + {"null_probability", "0"}, + {"min_length", std::to_string(min_length)}, + {"max_length", std::to_string(max_length)}, + })), + num_items, 0xdeadbeef); + + uint64_t dummy = 0; + for (auto _ : state) { + if constexpr (From == kStrings && To == kIndexOffsetViews) { + dummy += ToStringViewArray(checked_cast(*from)).length(); + } + + if constexpr (From == kIndexOffsetViews && To == kStrings) { + dummy += ToStringArray(checked_cast(*from)).length(); + } + + if constexpr (From == kIndexOffsetViews && To == kRawPointerViews) { + dummy += ToRawPointers(checked_cast(*from))->size(); + } + + if constexpr (From == kRawPointerViews && To == kIndexOffsetViews) { + dummy += ToIndexOffsets(checked_cast(*from))->size(); + } + + benchmark::DoNotOptimize(dummy); + } + state.SetItemsProcessed(state.iterations() * num_items); +} + +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kIndexOffsetViews, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kIndexOffsetViews, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kIndexOffsetViews, kShortButNeverInlineable); 
+BENCHMARK_TEMPLATE(ConvertViews, kStrings, kIndexOffsetViews, kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kIndexOffsetViews, kLongAndNeverInlineable); + +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kStrings, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kStrings, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kStrings, kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kStrings, kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kStrings, kLongAndNeverInlineable); + +/* +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kRawPointerViews, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kRawPointerViews, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kRawPointerViews, kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kRawPointerViews, kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kStrings, kRawPointerViews, kLongAndNeverInlineable); + +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kStrings, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kStrings, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kStrings, kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kStrings, kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kStrings, kLongAndNeverInlineable); + */ + +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kIndexOffsetViews, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kIndexOffsetViews, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kIndexOffsetViews, + kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kIndexOffsetViews, + kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kRawPointerViews, kIndexOffsetViews, + kLongAndNeverInlineable); + 
+BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kRawPointerViews, kAlwaysInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kRawPointerViews, kUsuallyInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kRawPointerViews, + kShortButNeverInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kRawPointerViews, + kLongAndSeldomInlineable); +BENCHMARK_TEMPLATE(ConvertViews, kIndexOffsetViews, kRawPointerViews, + kLongAndNeverInlineable); + +} // namespace +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/string_header.cc b/cpp/src/arrow/util/string_header.cc new file mode 100644 index 00000000000..e12d4553b16 --- /dev/null +++ b/cpp/src/arrow/util/string_header.cc @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/util/string_header.h" + +namespace arrow { + +std::ostream& operator<<(std::ostream& os, const StringHeader& header) { + os.write(header.data(), header.size()); + return os; +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h index 182b749f219..8a202e54b4f 100644 --- a/cpp/src/arrow/util/string_header.h +++ b/cpp/src/arrow/util/string_header.h @@ -37,49 +37,66 @@ #include #include #include -#include +#include #include #include namespace arrow { -// Variable length string or binary with 4 byte prefix and inline optimization -// for small values (12 bytes or fewer). This is similar to std::string_view -// except that the referenced is limited in size to UINT32_MAX and up to the -// first four bytes of the string are copied into the struct. The prefix allows -// failing comparisons early and can reduce the CPU cache working set when -// dealing with short strings. -// -// Short string |----|----|--------| -// ^ ^ ^ -// | | | -// size prefix remaining in-line portion -// -// Long string |----|----|--------| -// ^ ^ ^ -// | | | -// size prefix pointer to out-of-line portion -// -// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. -// -// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf +/// Variable length string or binary with 4 byte prefix and inline optimization +/// for small values (12 bytes or fewer). This is similar to std::string_view +/// except that the referenced is limited in size to UINT32_MAX and up to the +/// first four bytes of the string are copied into the struct. The prefix allows +/// failing comparisons early and can reduce the CPU cache working set when +/// dealing with short strings. 
+/// +/// This structure supports three states: +/// +/// Short string |----|----|--------| +/// ^ ^ ^ +/// | | | +/// size prefix remaining in-line portion, zero padded +/// +/// Long string |----|----|--------| +/// ^ ^ ^ +/// | | | +/// size prefix raw pointer to out-of-line portion +/// +/// IO Long string |----|----|----|----| +/// ^ ^ ^ ^ +/// | | | `----------. +/// size prefix buffer index and offset to out-of-line portion +/// +/// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. +/// +/// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf +/// +/// There is no way to determine from a non-inline StringHeader whether it refers +/// to its out-of-line portion with a raw pointer or with index/offset. This +/// information is stored at the column level; so a buffer of StringHeader will +/// contain only one or the other. In general unless a StringHeader is resident +/// in a StringView array's buffer it will refer to out-of-line data with a raw +/// pointer. This default is assumed by several members of StringHeader such as +/// operator==() and operator string_view() since these and other operations cannot +/// be performed on index/offset StringHeaders without also accessing the buffers +/// storing their out-of-line data. Which states pertain to each accessor and +/// constructor are listed in their comments. struct StringHeader { public: using value_type = char; - static constexpr size_t kPrefixSize = 4; - static constexpr size_t kInlineSize = 12; + static constexpr size_t kTotalSize = 16; + static constexpr size_t kSizeSize = sizeof(uint32_t); + static constexpr size_t kIndexOffsetSize = sizeof(uint32_t) * 2; + static constexpr size_t kPrefixSize = kTotalSize - kSizeSize - kIndexOffsetSize; + static_assert(kPrefixSize == 4); + static constexpr size_t kInlineSize = kTotalSize - kPrefixSize; + static_assert(kInlineSize == 12); + /// Construct an empty view. 
StringHeader() = default; - static StringHeader makeInline(uint32_t size, char** data) { - assert(size <= kInlineSize); - StringHeader s; - s.size_ = size; - *data = const_cast(s.data()); - return s; - } - + /// Construct a RAW POINTER view. StringHeader(const char* data, size_t len) : size_(static_cast(len)) { if (size_ == 0) return; @@ -95,41 +112,70 @@ struct StringHeader { } } + /// Construct a RAW POINTER view. StringHeader(const uint8_t* data, int64_t len) : StringHeader(reinterpret_cast(data), static_cast(len)) {} - // Making StringHeader implicitly constructible/convertible from char* and - // string literals, in order to allow for a more flexible API and optional - // interoperability. E.g: - // - // StringHeader bh = "literal"; - // std::optional obh = "literal"; - // - // NOLINTNEXTLINE runtime/explicit - StringHeader(const char* data) : StringHeader(data, strlen(data)) {} + /// Convenience implicit constructor for RAW POINTER views from C string/string literal. + /// + /// NOLINTNEXTLINE runtime/explicit + StringHeader(const char* data) : StringHeader(data, std::strlen(data)) {} + /// Construct a RAW POINTER view. explicit StringHeader(const std::string& value) : StringHeader(value.data(), value.size()) {} + /// Construct a RAW POINTER view. explicit StringHeader(std::string_view value) : StringHeader(value.data(), value.size()) {} + /// Construct an INDEX/OFFSET view. + StringHeader(const char* data, uint32_t len, uint32_t buffer_index, + const char* buffer_data) + : size_(len) { + if (size_ == 0) return; + + // TODO(bkietz) better option than assert? + assert(data); + if (IsInline()) { + // small string: inlined. Bytes beyond size_ are already 0 + memcpy(prefix_.data(), data, size_); + } else { + // large string: store index/offset + memcpy(prefix_.data(), data, kPrefixSize); + SetIndexOffset(buffer_index, static_cast(data - buffer_data)); + } + } + + /// Construct an INDEX/OFFSET view. 
+ StringHeader(uint32_t len, std::array prefix, uint32_t buffer_index, + uint32_t offset) + : size_(len), prefix_(prefix) { + SetIndexOffset(buffer_index, offset); + } + + /// True if the view's data is entirely stored inline. + /// This function is safe for use against both RAW POINTER and INDEX/OFFSET views. bool IsInline() const { return IsInline(size_); } static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; } + static constexpr bool IsInline(size_t size) { return size <= kInlineSize; } + static constexpr bool IsInline(int64_t size) { + return size <= static_cast(kInlineSize); + } + /// Return a RAW POINTER view's data. const char* data() const& { return IsInline() ? prefix_.data() : value_.data; } const char* data() && = delete; + /// The number of characters viewed by this StringHeader. + /// This function is safe for use against both RAW POINTER and INDEX/OFFSET views. size_t size() const { return size_; } - size_t capacity() const { return size_; } - - friend std::ostream& operator<<(std::ostream& os, const StringHeader& header) { - os.write(header.data(), header.size()); - return os; - } + /// Print a RAW POINTER view to a std::ostream. + friend std::ostream& operator<<(std::ostream& os, const StringHeader& header); + /// Equality comparison between RAW POINTER views. bool operator==(const StringHeader& other) const { // Compare lengths and first 4 characters. if (SizeAndPrefixAsInt64() != other.SizeAndPrefixAsInt64()) { @@ -146,8 +192,90 @@ struct StringHeader { size_ - kPrefixSize) == 0; } + /// Inequality comparison between RAW POINTER views. bool operator!=(const StringHeader& other) const { return !(*this == other); } + /// Less-than comparison between RAW POINTER views. + bool operator<(const StringHeader& other) const { return Compare(other) < 0; } + + /// Less-than-or-equal comparison between RAW POINTER views. 
+ bool operator<=(const StringHeader& other) const { return Compare(other) <= 0; } + + /// Greater-than comparison between RAW POINTER views. + bool operator>(const StringHeader& other) const { return Compare(other) > 0; } + + /// Greater-than-or-equal comparison between RAW POINTER views. + bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; } + + /// Conversion to std::string_view for RAW POINTER views. + explicit operator std::string_view() const& { return {data(), size()}; } + explicit operator std::string_view() && = delete; + + /// Return the always-inline cached first 4 bytes of this StringHeader. + /// This function is safe for use against both RAW POINTER and INDEX/OFFSET views. + std::array GetPrefix() const { return prefix_; } + + /// Return an INDEX/OFFSET view's buffer index. + uint32_t GetBufferIndex() const { return value_.io_data.buffer_index; } + + /// Return an INDEX/OFFSET view's buffer offset. + uint32_t GetBufferOffset() const { return value_.io_data.offset; } + + /// Return a RAW POINTER view's data pointer. + /// + /// NOT VALID FOR INLINE VIEWS. + const char* GetRawPointer() const { return value_.data; } + + /// Return an INLINE view's data pointer. + /// + /// NOT VALID FOR VIEWS WHICH ARE NOT INLINE. + const char* GetInlineData() const& { + assert(IsInline()); + return prefix_.data(); + } + const char* GetInlineData() && = delete; + + /// Mutate into a RAW POINTER view. + /// + /// This function is only intended for use in converting from an equivalent INDEX/OFFSET + /// view; in particular it does not check or modify the prefix for consistency with the + /// new data pointer. + void SetRawPointer(const char* data) { value_.data = data; } + + /// Mutate into an INDEX/OFFSET view. + /// + /// This function is only intended for use in converting from an equivalent RAW POINTER + /// view; in particular it does not check or modify the prefix for consistency with the + /// new buffer index/offset. 
+ void SetIndexOffset(uint32_t buffer_index, uint32_t offset) { + value_.io_data = {buffer_index, offset}; + } + + /// Compare an INDEX/OFFSET view in place. + /// + /// Equivalent comparison will be accomplished by (for example) first converting both + /// views to std::string_view and comparing those, but this would not take advantage + /// of the cached 4 byte prefix. + template + bool EqualsIndexOffset(const BufferPtr* char_buffers, const StringHeader& other, + const BufferPtr* other_char_buffers) const { + if (SizeAndPrefixAsInt64() != other.SizeAndPrefixAsInt64()) { + return false; + } + if (IsInline()) { + return InlinedAsInt64() == other.InlinedAsInt64(); + } + auto* ptr = + char_buffers[GetBufferIndex()]->template data_as() + GetBufferOffset(); + auto* other_ptr = + other_char_buffers[other.GetBufferIndex()]->template data_as() + + other.GetBufferOffset(); + // Sizes are equal and this is not inline, therefore both are out of line and we + // have already checked that their kPrefixSize first characters are equal. + return memcmp(ptr + kPrefixSize, other_ptr + kPrefixSize, size() - kPrefixSize) == 0; + } + + private: // Returns 0, if this == other // < 0, if this < other // > 0, if this > other @@ -171,48 +299,35 @@ struct StringHeader { return (result != 0) ? 
result : size_ - other.size_; } - bool operator<(const StringHeader& other) const { return Compare(other) < 0; } - - bool operator<=(const StringHeader& other) const { return Compare(other) <= 0; } - - bool operator>(const StringHeader& other) const { return Compare(other) > 0; } - - bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; } - - std::string GetString() const { return std::string(data(), size()); } - - explicit operator std::string_view() const& { return std::string_view(data(), size()); } - operator std::string_view() && = delete; - - const char* begin() const& { return data(); } - const char* end() const& { return data() + size(); } - - const char* begin() && = delete; - const char* end() && = delete; - - bool empty() const { return size() == 0; } - - private: - inline int64_t SizeAndPrefixAsInt64() const { + int64_t SizeAndPrefixAsInt64() const { return reinterpret_cast(this)[0]; } - inline int64_t InlinedAsInt64() const { - return reinterpret_cast(this)[1]; - } + int64_t InlinedAsInt64() const { return reinterpret_cast(this)[1]; } int32_t PrefixAsInt() const { return *reinterpret_cast(&prefix_); } + // FIXME(bkietz) replace this with a std::array and forgo the union. + // Type punning (AKA violation of the strict aliasing rule) is undefined behavior. + // Using memcpy to access the bytes of the object representation of trivially copyable + // objects is not undefined behavior. Given sufficiently explicit hints on alignment + // and size, compilers elide memcpy calls in favor of identical assembly to what + // the type punning implementation produces. // We rely on all members being laid out top to bottom . C++ // guarantees this. 
uint32_t size_ = 0; - std::array prefix_ = {0}; + std::array prefix_ = {0}; union { std::array inlined = {0}; const char* data; + struct { + uint32_t buffer_index; + uint32_t offset; + } io_data; } value_; }; -static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes"); +static_assert(sizeof(StringHeader) == 16, "struct size expected to be exactly 16 bytes"); +static_assert(alignof(StringHeader) == 8, "struct alignment expected to be exactly 8 bytes"); } // namespace arrow diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h index 41f1730b339..a6793210a89 100644 --- a/cpp/src/arrow/visit_data_inline.h +++ b/cpp/src/arrow/visit_data_inline.h @@ -149,26 +149,38 @@ template struct ArraySpanInlineVisitor> { using c_type = std::string_view; + static std::string_view GetView(const StringHeader& s, + const std::shared_ptr* char_buffers) { + if (!s.IsInline()) { + const auto& buffer = char_buffers[s.GetBufferIndex()]; + return std::string_view{buffer->data_as() + s.GetBufferOffset(), s.size()}; + } + return std::string_view{s.GetInlineData(), s.size()}; + } + + static const std::shared_ptr* GetCharBuffers(const ArraySpan& arr) { + return reinterpret_cast*>(arr.buffers[2].data); + } + template static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, NullFunc&& null_func) { if (arr.length == 0) { return Status::OK(); } - const StringHeader* headers; - if (arr.buffers[1].data == NULLPTR) { - headers = NULLPTR; + auto* s = arr.GetValues(1); + if (checked_cast(arr.type)->has_raw_pointers()) { + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { return valid_func(std::string_view{s[index]}); }, + [&]() { return null_func(); }); } else { - // Do not apply the array offset to the values array; the value_offsets - // index the non-sliced values array. 
- headers = arr.GetValues(1); + auto* char_buffers = GetCharBuffers(arr); + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { return valid_func(GetView(s[index], char_buffers)); }, + [&]() { return null_func(); }); } - return VisitBitBlocks( - arr.buffers[0].data, arr.offset, arr.length, - [&](int64_t(index)) { - return valid_func(static_cast(headers[index])); - }, - [&]() { return null_func(); }); } template @@ -177,21 +189,19 @@ struct ArraySpanInlineVisitor> { if (arr.length == 0) { return; } - const StringHeader* headers; - if (arr.buffers[1].data == NULLPTR) { - headers = NULLPTR; + auto* s = arr.GetValues(1); + if (checked_cast(arr.type)->has_raw_pointers()) { + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { valid_func(std::string_view{s[index]}); }, + std::forward(null_func)); } else { - // Do not apply the array offset to the values array; the value_offsets - // index the non-sliced values array. - headers = arr.GetValues(1); + auto* char_buffers = GetCharBuffers(arr); + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { valid_func(GetView(s[index], char_buffers)); }, + std::forward(null_func)); } - - VisitBitBlocksVoid( - arr.buffers[0].data, arr.offset, arr.length, - [&](int64_t(index)) { - valid_func(static_cast(headers[index])); - }, - std::forward(null_func)); } }; @@ -324,9 +334,8 @@ typename internal::call_traits::enable_if_return::type VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_values, int64_t null_count, ValidFunc&& valid_func, NullFunc&& null_func) { - ARROW_UNUSED(null_count); - internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset, - num_values); + internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? 
NULLPTR : valid_bits, + valid_bits_offset, num_values); int64_t position = 0; int64_t offset_position = valid_bits_offset; while (position < num_values) { diff --git a/cpp/src/generated/File_generated.h b/cpp/src/generated/File_generated.h index 5b219f1eb0e..06953c4a040 100644 --- a/cpp/src/generated/File_generated.h +++ b/cpp/src/generated/File_generated.h @@ -26,18 +26,15 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Block FLATBUFFERS_FINAL_CLASS { int64_t bodyLength_; public: - Block() - : offset_(0), - metaDataLength_(0), - padding0__(0), - bodyLength_(0) { - (void)padding0__; + Block() { + memset(static_cast(this), 0, sizeof(Block)); } Block(int64_t _offset, int32_t _metaDataLength, int64_t _bodyLength) : offset_(flatbuffers::EndianScalar(_offset)), metaDataLength_(flatbuffers::EndianScalar(_metaDataLength)), padding0__(0), bodyLength_(flatbuffers::EndianScalar(_bodyLength)) { + (void)padding0__; } /// Index to the start of the RecordBlock (note this is past the Message header) int64_t offset() const { @@ -122,6 +119,7 @@ struct FooterBuilder { : fbb_(_fbb) { start_ = fbb_.StartTable(); } + FooterBuilder &operator=(const FooterBuilder &); flatbuffers::Offset