From c35b86c62d2b5c3f4e6dbc02d777ba85ec90f243 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 1 Mar 2023 18:41:41 +0100 Subject: [PATCH 01/78] Initial commit --- cpp/src/parquet/encoding.cc | 127 +++++++++++++++++++++ cpp/src/parquet/encoding_test.cc | 64 +++++++++++ python/pyarrow/tests/parquet/test_basic.py | 8 ++ 3 files changed, 199 insertions(+) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index e972a86ccf0..e7e0ea09fcb 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2977,6 +2977,125 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { // ---------------------------------------------------------------------- // DELTA_BYTE_ARRAY +// This is also known as incremental encoding or front compression: for each element in a +// sequence of strings, store the prefix length of the previous entry plus the suffix. +// +// This is stored as a sequence of delta-encoded prefix lengths (DELTA_BINARY_PACKED), +// followed by the suffixes encoded as delta length byte arrays (DELTA_LENGTH_BYTE_ARRAY). + +// ---------------------------------------------------------------------- +// DeltaByteArrayEncoder + +template +class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder { + public: + using T = typename DType::c_type; + + explicit DeltaByteArrayEncoder(const ColumnDescriptor* descr, MemoryPool* pool) + : EncoderImpl(descr, Encoding::DELTA_BYTE_ARRAY, + pool = ::arrow::default_memory_pool()), + sink_(pool), + prefix_length_encoder_(nullptr, pool), + suffix_encoder_(nullptr, pool) {} + + std::shared_ptr FlushValues() override; + + int64_t EstimatedDataEncodedSize() override { + return prefix_length_encoder_.EstimatedDataEncodedSize() + + suffix_encoder_.EstimatedDataEncodedSize(); + } + + using TypedEncoder::Put; + + void Put(const ::arrow::Array& values) override { + AssertBaseBinary(values); + const auto& data = values.data(); + auto src = data->GetValues(1); + + if (values.null_count() == 0) { + Put(src, static_cast(values.length())); + } else { + PutSpaced(src, static_cast(data->length), data->GetValues(0, 0), + data->offset); + } + } + + void Put(const T* buffer, int num_values) override; + + void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + if (valid_bits != NULLPTR) { + PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T), + this->memory_pool())); + T* data = reinterpret_cast(buffer->mutable_data()); + int num_valid_values = ::arrow::util::internal::SpacedCompress( + src, num_values, valid_bits, valid_bits_offset, data); + Put(data, num_valid_values); + } else { + Put(src, num_values); + } + } + + uint32_t total_value_count_{0}; + ::arrow::BufferBuilder sink_; + DeltaBitPackEncoder prefix_length_encoder_; + DeltaLengthByteArrayEncoder suffix_encoder_; + string_view last_value_; +}; + +template +void DeltaByteArrayEncoder::Put(const T* src, int num_values) { + if (num_values == 0) { + return; + } + ArrowPoolVector prefix_lengths(num_values); + + if (ARROW_PREDICT_TRUE(last_value_.empty())) { + last_value_ = string_view{reinterpret_cast(src[0].ptr), src[0].len}; + suffix_encoder_.Put(&src[0], 1); + prefix_lengths[0] = 0; + } + total_value_count_ += num_values; + + for (int32_t i = 1; i < num_values; i++) { + auto prefix = string_view{reinterpret_cast(src[i].ptr), src[i].len}; + + size_t j = 0; + while (j < std::min(src[i - 1].len, src[i].len)) { + if (last_value_[j] != prefix[j]) { + break; + } + j++; + } + + prefix_lengths[i] = j; + const uint8_t* suffix_ptr = src[i].ptr + j; + const uint32_t suffix_length = static_cast(src[i].len - j); + last_value_ = string_view{reinterpret_cast(suffix_ptr), suffix_length}; + const ByteArray suffix(suffix_length, suffix_ptr); + suffix_encoder_.Put(&suffix, 1); + } + prefix_length_encoder_.Put(prefix_lengths.data(), num_values); +} + +template +std::shared_ptr DeltaByteArrayEncoder::FlushValues() { + PARQUET_THROW_NOT_OK(sink_.Resize(EstimatedDataEncodedSize(), false)); + + std::shared_ptr prefix_lengths = prefix_length_encoder_.FlushValues(); + PARQUET_THROW_NOT_OK(sink_.Append(prefix_lengths->data(), prefix_lengths->size())); + + std::shared_ptr suffixes = suffix_encoder_.FlushValues(); + PARQUET_THROW_NOT_OK(sink_.Append(suffixes->data(), suffixes->size())); + + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(sink_.Finish(&buffer, true)); + return buffer; +} + +// ---------------------------------------------------------------------- +// DeltaByteArrayDecoder + class DeltaByteArrayDecoder : public DecoderImpl, virtual public TypedDecoder { public: @@ -3353,6 +3472,14 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin default: throw ParquetException("RLE only supports BOOLEAN"); } + } else if (encoding == Encoding::DELTA_BYTE_ARRAY) { + switch (type_num) { + case Type::BYTE_ARRAY: + return std::make_unique>(descr, pool); + default: + throw ParquetException("DELTA_BYTE_ARRAY only supports BYTE_ARRAY"); + break; + } } else { ParquetException::NYI("Selected encoding is not supported"); } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 0ac5fd76e79..da33bb2e6db 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1977,4 +1977,68 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); } +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encode/decode tests. + +template +class TestDeltaByteArrayEncoding : public TestEncodingBase { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + virtual void CheckRoundtrip() { + auto encoder = + MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, false, descr_.get()); + auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + + encoder->Put(draws_, num_values_); + encode_buffer_ = encoder->FlushValues(); + + decoder->SetData(num_values_, encode_buffer_->data(), + static_cast(encode_buffer_->size())); + int values_decoded = decoder->Decode(decode_buf_, num_values_); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResults(decode_buf_, draws_, num_values_)); + } + + void CheckRoundtripSpaced(const uint8_t* valid_bits, int64_t valid_bits_offset) { + auto encoder = + MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, false, descr_.get()); + auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY, descr_.get()); + int null_count = 0; + for (auto i = 0; i < num_values_; i++) { + if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + null_count++; + } + } + + encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); + encode_buffer_ = encoder->FlushValues(); + decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + static_cast(encode_buffer_->size())); + auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, + valid_bits, valid_bits_offset); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced(decode_buf_, draws_, num_values_, + valid_bits, valid_bits_offset)); + } + + protected: + USING_BASE_MEMBERS(); +}; + +typedef ::testing::Types TestDeltaByteArrayEncodingTypes; +TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); + +TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { + ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); + ASSERT_NO_FATAL_FAILURE(this->Execute(250, 2)); + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_prob*/ 0)); + + ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 200)); + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, + /*null_probability*/ 0.1)); +} } // namespace parquet::test diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 9bc59cbcf96..2d983325297 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -426,6 +426,14 @@ def test_column_encoding(use_legacy_dataset): 'c': "DELTA_LENGTH_BYTE_ARRAY"}, use_legacy_dataset=use_legacy_dataset) + # Check "DELTA_BYTE_ARRAY" for byte columns. + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'a': "PLAIN", + 'b': "DELTA_BINARY_PACKED", + 'c': "DELTA_BYTE_ARRAY"}, + use_legacy_dataset=use_legacy_dataset) + # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'. # This should throw an error as it is only supports FLOAT and DOUBLE. with pytest.raises(IOError, From 0eeea9639395374b49a708593335a0b7adae0510 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 1 Mar 2023 19:32:29 +0100 Subject: [PATCH 02/78] Adding PutBinaryArray --- cpp/src/parquet/encoding.cc | 66 ++++++++++++++++++++++++-------- cpp/src/parquet/encoding_test.cc | 51 ++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index e7e0ea09fcb..61cda96c7e4 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3007,18 +3007,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
::Put; - void Put(const ::arrow::Array& values) override { - AssertBaseBinary(values); - const auto& data = values.data(); - auto src = data->GetValues(1); - - if (values.null_count() == 0) { - Put(src, static_cast(values.length())); - } else { - PutSpaced(src, static_cast(data->length), data->GetValues(0, 0), - data->offset); - } - } + void Put(const ::arrow::Array& values) override; void Put(const T* buffer, int num_values) override; @@ -3036,13 +3025,61 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
+ void PutBinaryArray(const ArrayType& array) { + // TODO: optimize using ArrowPoolVector prefix_lengths(num_values); + PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( + *array.data(), + [&](::std::string_view view) { + if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { + return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + } + uint32_t previous_len = 0; + const ByteArray src{view}; + if (ARROW_PREDICT_TRUE(last_value_.empty())) { + last_value_ = view; + suffix_encoder_.Put(&src, 1); + prefix_length_encoder_.Put({static_cast(0)}, 1); + } else { + uint32_t j = 0; + while (j < std::min(previous_len, src.len)) { + if (last_value_[j] != view[j]) { + break; + } + j++; + } + previous_len = j; + prefix_length_encoder_.Put({static_cast(j)}, 1); + + const uint8_t* suffix_ptr = src.ptr + j; + const uint32_t suffix_length = static_cast(src.len - j); + last_value_ = + string_view{reinterpret_cast(suffix_ptr), suffix_length}; + const ByteArray suffix(suffix_length, suffix_ptr); + suffix_encoder_.Put(&suffix, 1); + } + return Status::OK(); + }, + []() { return Status::OK(); })); + } + ::arrow::BufferBuilder sink_; DeltaBitPackEncoder prefix_length_encoder_; DeltaLengthByteArrayEncoder suffix_encoder_; string_view last_value_; }; +template +void DeltaByteArrayEncoder::Put(const ::arrow::Array& values) { + AssertBaseBinary(values); + if (::arrow::is_binary_like(values.type_id())) { + PutBinaryArray(checked_cast(values)); + } else { + PutBinaryArray(checked_cast(values)); + } +} + template void DeltaByteArrayEncoder::Put(const T* src, int num_values) { if (num_values == 0) { @@ -3055,12 +3092,11 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { suffix_encoder_.Put(&src[0], 1); prefix_lengths[0] = 0; } - total_value_count_ += num_values; for (int32_t i = 1; i < num_values; i++) { auto prefix = string_view{reinterpret_cast(src[i].ptr), src[i].len}; - size_t j = 0; + uint32_t j = 0; while (j < std::min(src[i - 1].len, src[i].len)) { if (last_value_[j] != prefix[j]) { break; diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index da33bb2e6db..27e268ca172 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2041,4 +2041,55 @@ TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, /*null_probability*/ 0.1)); } + +TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { + const int64_t size = 50; + const int32_t min_length = 0; + const int32_t max_length = 10; + const int32_t num_unique = 10; + const double null_probability = 0.25; + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); + auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); + + auto CheckSeed = [&](std::shared_ptr<::arrow::Array> values) { + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast(values->length() - values->null_count()); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + + typename EncodingTraits::Accumulator acc; + if (::arrow::is_string(values->type()->id())) { + acc.builder = std::make_unique<::arrow::StringBuilder>(); + } else { + acc.builder = std::make_unique<::arrow::BinaryBuilder>(); + } + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), + values->null_bitmap_data(), values->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.builder->Finish(&result)); + ASSERT_EQ(values->length(), result->length()); + ASSERT_OK(result->ValidateFull()); + + auto upcast_result = CastBinaryTypesHelper(result, values->type()); + ::arrow::AssertArraysEqual(*values, *result); + }; + + ::arrow::random::RandomArrayGenerator rag(42); + auto values = rag.String(0, min_length, max_length, null_probability); + CheckSeed(values); + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + rag = ::arrow::random::RandomArrayGenerator(seed); + + values = rag.String(size, min_length, max_length, null_probability); + CheckSeed(values); + + values = + rag.BinaryWithRepeats(size, num_unique, min_length, max_length, null_probability); + CheckSeed(values); + } +} } // namespace parquet::test From 450b0a666d4b71a0c4bb53e4ee4b5f85f3d0dcc9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 Mar 2023 00:57:05 +0100 Subject: [PATCH 03/78] Add FIXED_LEN_BYTE_ARRAY --- cpp/src/parquet/encoding.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 61cda96c7e4..b6f619cf7d7 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3028,7 +3028,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
void PutBinaryArray(const ArrayType& array) { - // TODO: optimize using ArrowPoolVector prefix_lengths(num_values); + // TODO(rok): optimize using ArrowPoolVector prefix_lengths(num_values); PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( *array.data(), [&](::std::string_view view) { @@ -3512,9 +3512,10 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin switch (type_num) { case Type::BYTE_ARRAY: return std::make_unique>(descr, pool); + case Type::FIXED_LEN_BYTE_ARRAY: + return std::make_unique>(descr, pool); default: throw ParquetException("DELTA_BYTE_ARRAY only supports BYTE_ARRAY"); - break; } } else { ParquetException::NYI("Selected encoding is not supported"); @@ -3567,10 +3568,15 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin "DELTA_BINARY_PACKED decoder only supports INT32 and INT64"); } } else if (encoding == Encoding::DELTA_BYTE_ARRAY) { - if (type_num == Type::BYTE_ARRAY) { - return std::make_unique(descr, pool); + switch (type_num) { + case Type::BYTE_ARRAY: + return std::make_unique(descr, pool); + case Type::FIXED_LEN_BYTE_ARRAY: + return std::make_unique(descr, pool); + default: + throw ParquetException( + "DELTA_BYTE_ARRAY only supports BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"); } - throw ParquetException("DELTA_BYTE_ARRAY only supports BYTE_ARRAY"); } else if (encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) { if (type_num == Type::BYTE_ARRAY) { return std::make_unique(descr, pool); From ad7b35f3c19e8d4ce804bf5d956d51afa217735b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Mar 2023 04:44:55 +0100 Subject: [PATCH 04/78] More FLBAType work --- cpp/src/parquet/encoding.cc | 153 ++++++++++++++++++++++++++++--- cpp/src/parquet/encoding_test.cc | 36 ++++++++ 2 files changed, 178 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b6f619cf7d7..cd4b956895a 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1215,6 +1215,35 @@ struct ArrowBinaryHelper { int64_t chunk_space_remaining; }; +struct ArrowFLBAHelper { + explicit ArrowFLBAHelper(::arrow::FixedSizeBinaryBuilder* builder) { + this->builder = builder; + this->chunk_space_remaining = + ::arrow::kBinaryMemoryLimit - this->builder->value_data_length(); + } + + Status PushChunk() { + std::shared_ptr<::arrow::Array> result; + RETURN_NOT_OK(builder->Finish(&result)); + chunks.push_back(result); + chunk_space_remaining = ::arrow::kBinaryMemoryLimit; + return Status::OK(); + } + + bool CanFit(int64_t length) const { return length <= chunk_space_remaining; } + + Status Append(const uint8_t* data, int32_t length) { + chunk_space_remaining -= length; + return builder->Append(data); + } + + Status AppendNull() { return builder->AppendNull(); } + + ::arrow::FixedSizeBinaryBuilder* builder; + std::vector> chunks; + int64_t chunk_space_remaining; +}; + template <> inline int PlainDecoder::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, @@ -3026,6 +3055,42 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
prefix_lengths(num_values); + PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<::arrow::FixedSizeBinaryType>( + *array.data(), + [&](::std::string_view view) { + uint32_t previous_len = 0; + const ByteArray src{view}; + if (ARROW_PREDICT_TRUE(last_value_.empty())) { + last_value_ = view; + suffix_encoder_.Put(&src, 1); + prefix_length_encoder_.Put({static_cast(0)}, 1); + } else { + uint32_t j = 0; + while (j < std::min(previous_len, byte_width)) { + if (last_value_[j] != view[j]) { + break; + } + j++; + } + previous_len = j; + prefix_length_encoder_.Put({static_cast(j)}, 1); + + const uint8_t* suffix_ptr = src.ptr + j; + const uint32_t suffix_length = static_cast(byte_width - j); + last_value_ = + string_view{reinterpret_cast(suffix_ptr), suffix_length}; + const ByteArray suffix(suffix_length, suffix_ptr); + suffix_encoder_.Put(&suffix, 1); + } + return Status::OK(); + }, + []() { return Status::OK(); })); + } + template void PutBinaryArray(const ArrayType& array) { // TODO(rok): optimize using ArrowPoolVector prefix_lengths(num_values); @@ -3070,12 +3135,33 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
+inline void DeltaByteArrayEncoder::Put(const FixedLenByteArray* src, + int num_values) { + if (descr_->type_length() == 0) { + return; + } + for (int i = 0; i < num_values; ++i) { + // Write the result to the output stream + DCHECK(src[i].ptr != nullptr) << "Value ptr cannot be NULL"; + PARQUET_THROW_NOT_OK(sink_.Append(src[i].ptr, descr_->type_length())); + } +} + +template <> +void DeltaByteArrayEncoder::Put(const ::arrow::Array& values) { + if (!::arrow::is_fixed_size_binary(values.type_id())) { + throw ParquetException("Only FixedSizeBinaryArray and subclasses supported"); + } + PutFixedLenByteArray(checked_cast(values)); +} + template void DeltaByteArrayEncoder::Put(const ::arrow::Array& values) { AssertBaseBinary(values); if (::arrow::is_binary_like(values.type_id())) { PutBinaryArray(checked_cast(values)); - } else { + } else if (::arrow::is_large_binary_like(values.type_id())) { PutBinaryArray(checked_cast(values)); } } @@ -3132,8 +3218,10 @@ std::shared_ptr DeltaByteArrayEncoder::FlushValues() { // ---------------------------------------------------------------------- // DeltaByteArrayDecoder -class DeltaByteArrayDecoder : public DecoderImpl, - virtual public TypedDecoder { +template +class DeltaByteArrayDecoder : public DecoderImpl, virtual public TypedDecoder { + using T = typename DType::c_type; + public: explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr, MemoryPool* pool = ::arrow::default_memory_pool()) @@ -3172,23 +3260,22 @@ class DeltaByteArrayDecoder : public DecoderImpl, last_value_ = ""; } - int Decode(ByteArray* buffer, int max_values) override { + int Decode(ByteArray* buffer, int max_values) { return GetInternal(buffer, max_values); } int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out) override { + typename EncodingTraits::Accumulator* out) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); return result; } - int DecodeArrow( - int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::DictAccumulator* builder) override { + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override { ParquetException::NYI("DecodeArrow of DictAccumulator for DeltaByteArrayDecoder"); } @@ -3248,6 +3335,41 @@ class DeltaByteArrayDecoder : public DecoderImpl, return max_values; } + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_num_values) { + ArrowFLBAHelper helper(out); + + std::vector values(num_values); + const int num_valid_values = GetInternal(values.data(), num_values - null_count); + DCHECK_EQ(num_values - null_count, num_valid_values); + + auto values_ptr = reinterpret_cast(values.data()); + int value_idx = 0; + + RETURN_NOT_OK(VisitNullBitmapInline( + valid_bits, valid_bits_offset, num_values, null_count, + [&]() { + const auto& val = values_ptr[value_idx]; + if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { + RETURN_NOT_OK(helper.PushChunk()); + } + RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); + ++value_idx; + return Status::OK(); + }, + [&]() { + RETURN_NOT_OK(helper.AppendNull()); + --null_count; + return Status::OK(); + })); + + DCHECK_EQ(null_count, 0); + *out_num_values = num_valid_values; + return Status::OK(); + } + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out, @@ -3295,6 +3417,15 @@ class DeltaByteArrayDecoder : public DecoderImpl, std::shared_ptr buffered_data_; }; +class DeltaByteArrayFLBADecoder : public DeltaByteArrayDecoder, + virtual public FLBADecoder { + public: + using Base = DeltaByteArrayDecoder; + using Base::DeltaByteArrayDecoder; + + int Decode(FixedLenByteArray* buffer, int max_values) override { return 0; }; +}; + // ---------------------------------------------------------------------- // BYTE_STREAM_SPLIT @@ -3513,7 +3644,7 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin case Type::BYTE_ARRAY: return std::make_unique>(descr, pool); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_unique>(descr, pool); + return std::make_unique>(descr, pool); default: throw ParquetException("DELTA_BYTE_ARRAY only supports BYTE_ARRAY"); } @@ -3572,7 +3703,7 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin case Type::BYTE_ARRAY: return std::make_unique(descr, pool); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_unique(descr, pool); + return std::make_unique(descr, pool); default: throw ParquetException( "DELTA_BYTE_ARRAY only supports BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"); diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 27e268ca172..f8ed97121cb 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2092,4 +2092,40 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { CheckSeed(values); } } + +TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPutFixedLength) { + const int64_t size = 50; + const double null_probability = 0.25; + ::arrow::random::RandomArrayGenerator rag(0); + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); + auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); + + auto CheckSeed = [&](std::shared_ptr<::arrow::Array> values) { + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast(values->length() - values->null_count()); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + + typename EncodingTraits::Accumulator acc(values->type()); + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), + values->null_bitmap_data(), values->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.Finish(&result)); + ASSERT_EQ(values->length(), result->length()); + ASSERT_OK(result->ValidateFull()); + ::arrow::AssertArraysEqual(*values, *result); + }; + + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + for (auto length : {0, 10, 100, 1000}) { + rag = ::arrow::random::RandomArrayGenerator(seed); + auto values = rag.FixedSizeBinary(size, length, null_probability); + CheckSeed(values); + } + } +} } // namespace parquet::test From 166ecf94d1f5a21324d78b820a7064ca8417d235 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 21 Mar 2023 21:54:01 +0100 Subject: [PATCH 05/78] Review feedback --- cpp/src/parquet/encoding.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index cd4b956895a..56b3433252b 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3020,9 +3020,9 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
MakeEncoder(Type::type type_num, Encoding::type encodin case Type::FIXED_LEN_BYTE_ARRAY: return std::make_unique>(descr, pool); default: - throw ParquetException("DELTA_BYTE_ARRAY only supports BYTE_ARRAY"); + throw ParquetException( + "DELTA_BYTE_ARRAY only supports BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"); } } else { ParquetException::NYI("Selected encoding is not supported"); From d365c7d0112f7b8b11fabe39d7a1e4fc53bf881a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 22 Mar 2023 03:35:41 +0100 Subject: [PATCH 06/78] Review feedback --- cpp/src/parquet/encoding.cc | 13 +++++++------ cpp/src/parquet/encoding_test.cc | 2 ++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 56b3433252b..e3b38f6b5d9 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3057,20 +3057,20 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
prefix_lengths(num_values); PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<::arrow::FixedSizeBinaryType>( *array.data(), [&](::std::string_view view) { - uint32_t previous_len = 0; const ByteArray src{view}; if (ARROW_PREDICT_TRUE(last_value_.empty())) { last_value_ = view; suffix_encoder_.Put(&src, 1); prefix_length_encoder_.Put({static_cast(0)}, 1); + previous_len = byte_width; } else { uint32_t j = 0; - while (j < std::min(previous_len, byte_width)) { + while (j < previous_len) { if (last_value_[j] != view[j]) { break; } @@ -3093,19 +3093,20 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
void PutBinaryArray(const ArrayType& array) { - // TODO(rok): optimize using ArrowPoolVector prefix_lengths(num_values); + uint32_t previous_len = 0; + PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( *array.data(), [&](::std::string_view view) { - if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { + if (ARROW_PREDICT_TRUE(view.size() > kMaxByteArraySize)) { return Status::Invalid("Parquet cannot store strings with size 2GB or more"); } - uint32_t previous_len = 0; const ByteArray src{view}; if (ARROW_PREDICT_TRUE(last_value_.empty())) { last_value_ = view; suffix_encoder_.Put(&src, 1); prefix_length_encoder_.Put({static_cast(0)}, 1); + previous_len = src.len; } else { uint32_t j = 0; while (j < std::min(previous_len, src.len)) { diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index f8ed97121cb..fb2911f807b 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2030,6 +2030,8 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { typedef ::testing::Types TestDeltaByteArrayEncodingTypes; TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); +// TODO: add FLBAType and Decimal type tests + TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); ASSERT_NO_FATAL_FAILURE(this->Execute(250, 2)); From c4e2226aa364fee2a2becb69fd1dadf0df2dc25e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 23 Mar 2023 00:14:58 +0100 Subject: [PATCH 07/78] DeltaByteArrayDecoderImpl --- cpp/src/parquet/encoding.cc | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index e3b38f6b5d9..ed4f173231a 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3220,12 +3220,12 @@ std::shared_ptr DeltaByteArrayEncoder::FlushValues() { // DeltaByteArrayDecoder template -class DeltaByteArrayDecoder : public DecoderImpl, virtual public TypedDecoder { +class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecoder { using T = typename DType::c_type; public: - explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr, - MemoryPool* pool = ::arrow::default_memory_pool()) + explicit DeltaByteArrayDecoderImpl(const ColumnDescriptor* descr, + MemoryPool* pool = ::arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY), prefix_len_decoder_(nullptr, pool), suffix_decoder_(nullptr, pool), @@ -3261,10 +3261,6 @@ class DeltaByteArrayDecoder : public DecoderImpl, virtual public TypedDecoder
::Accumulator* out) override { @@ -3280,7 +3276,7 @@ class DeltaByteArrayDecoder : public DecoderImpl, virtual public TypedDecoder
decoder_; DeltaBitPackDecoder prefix_len_decoder_; DeltaLengthByteArrayDecoder suffix_decoder_; @@ -3418,13 +3415,28 @@ class DeltaByteArrayDecoder : public DecoderImpl, virtual public TypedDecoder
buffered_data_; }; -class DeltaByteArrayFLBADecoder : public DeltaByteArrayDecoder, +class DeltaByteArrayDecoder : public DeltaByteArrayDecoderImpl { + public: + using Base = DeltaByteArrayDecoderImpl; + using Base::DeltaByteArrayDecoderImpl; + + int Decode(ByteArray* buffer, int max_values) override { + return GetInternal(buffer, max_values); + } +}; + +class DeltaByteArrayFLBADecoder : public DeltaByteArrayDecoderImpl, virtual public FLBADecoder { public: - using Base = DeltaByteArrayDecoder; - using Base::DeltaByteArrayDecoder; + using Base = DeltaByteArrayDecoderImpl; + using Base::DeltaByteArrayDecoderImpl; - int Decode(FixedLenByteArray* buffer, int max_values) override { return 0; }; + int Decode(ByteArray* buffer, int max_values) { + return GetInternal(buffer, max_values); + } + int Decode(FixedLenByteArray* buffer, int max_values) override { + throw ParquetException("Cannot decode DeltaByteArray from FixedLenByteArray"); + } }; // ---------------------------------------------------------------------- From d1fbd2135fefc0c3e3c0ea901df71771b1fdcd68 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 23 Mar 2023 02:40:47 +0100 Subject: [PATCH 08/78] Add Python test for FLBA and boolean with RLE --- python/pyarrow/tests/parquet/test_basic.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 2d983325297..0f5a6432aab 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -392,9 +392,12 @@ def test_byte_stream_split(use_legacy_dataset): def test_column_encoding(use_legacy_dataset): arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) - arr_bin = pa.array([str(x) for x in range(100)]) - mixed_table = pa.Table.from_arrays([arr_float, arr_int, arr_bin], - names=['a', 'b', 'c']) + arr_bin = pa.array([str(x) for x in range(100)], type=pa.binary()) + arr_flba = pa.array([str(x) for x in range(100)], type=pa.binary(10)) + arr_bool = pa.array([False, True, False, False] * 25) + mixed_table = pa.Table.from_arrays( + [arr_float, arr_int, arr_bin, arr_flba, arr_bool], + names=['a', 'b', 'c', 'd', 'e']) # Check "BYTE_STREAM_SPLIT" for column 'a' and "PLAIN" column_encoding for # column 'b' and 'c'. @@ -431,7 +434,14 @@ def test_column_encoding(use_legacy_dataset): use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", - 'c': "DELTA_BYTE_ARRAY"}, + 'c': "DELTA_BYTE_ARRAY", + 'd': "DELTA_BYTE_ARRAY"}, + use_legacy_dataset=use_legacy_dataset) + + # Check "RLE" for boolean columns. + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'e': "RLE"}, use_legacy_dataset=use_legacy_dataset) # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'. From ad63efc0cbe47ec2e8b277879579d7c3218924b5 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 23 Mar 2023 14:32:26 +0100 Subject: [PATCH 09/78] Review feedback --- cpp/src/parquet/encoding.cc | 45 ++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index ed4f173231a..56c9691115f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1197,6 +1197,7 @@ struct ArrowBinaryHelper { bool CanFit(int64_t length) const { return length <= chunk_space_remaining; } void UnsafeAppend(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); chunk_space_remaining -= length; builder->UnsafeAppend(data, length); } @@ -1225,7 +1226,7 @@ struct ArrowFLBAHelper { Status PushChunk() { std::shared_ptr<::arrow::Array> result; RETURN_NOT_OK(builder->Finish(&result)); - chunks.push_back(result); + chunks.push_back(std::move(result)); chunk_space_remaining = ::arrow::kBinaryMemoryLimit; return Status::OK(); } @@ -1233,6 +1234,7 @@ struct ArrowFLBAHelper { bool CanFit(int64_t length) const { return length <= chunk_space_remaining; } Status Append(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); chunk_space_remaining -= length; return builder->Append(data); } @@ -3058,20 +3060,22 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
( *array.data(), [&](::std::string_view view) { + // Convert view to ByteArray so it can be passed to the suffix_encoder_. const ByteArray src{view}; - if (ARROW_PREDICT_TRUE(last_value_.empty())) { - last_value_ = view; + if (last_value_view.empty()) { + last_value_view = view; suffix_encoder_.Put(&src, 1); prefix_length_encoder_.Put({static_cast(0)}, 1); previous_len = byte_width; } else { uint32_t j = 0; while (j < previous_len) { - if (last_value_[j] != view[j]) { + if (last_value_view[j] != view[j]) { break; } j++; @@ -3081,36 +3085,39 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(byte_width - j); - last_value_ = + last_value_view = string_view{reinterpret_cast(suffix_ptr), suffix_length}; + // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); } return Status::OK(); }, []() { return Status::OK(); })); + last_value_ = last_value_view; } template void PutBinaryArray(const ArrayType& array) { uint32_t previous_len = 0; + std::string_view last_value_view = last_value_; PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( *array.data(), [&](::std::string_view view) { - if (ARROW_PREDICT_TRUE(view.size() > kMaxByteArraySize)) { + if (ARROW_PREDICT_FALSE(view.size() >= kMaxByteArraySize)) { return Status::Invalid("Parquet cannot store strings with size 2GB or more"); } const ByteArray src{view}; - if (ARROW_PREDICT_TRUE(last_value_.empty())) { - last_value_ = view; + if (last_value_view.empty()) { + last_value_view = view; suffix_encoder_.Put(&src, 1); prefix_length_encoder_.Put({static_cast(0)}, 1); previous_len = src.len; } else { uint32_t j = 0; while (j < std::min(previous_len, src.len)) { - if (last_value_[j] != view[j]) { + if (last_value_view[j] != view[j]) { break; } j++; @@ -3120,7 +3127,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(src.len - j); - last_value_ = + last_value_view = string_view{reinterpret_cast(suffix_ptr), suffix_length}; const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); @@ -3128,12 +3135,13 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
prefix_length_encoder_; DeltaLengthByteArrayEncoder suffix_encoder_; - string_view last_value_; + std::string last_value_; }; template <> @@ -3142,6 +3150,7 @@ inline void DeltaByteArrayEncoder::Put(const FixedLenByteArray* src, if (descr_->type_length() == 0) { return; } + // TODO: This is a temporary solution. for (int i = 0; i < num_values; ++i) { // Write the result to the output stream DCHECK(src[i].ptr != nullptr) << "Value ptr cannot be NULL"; @@ -3172,10 +3181,12 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { if (num_values == 0) { return; } - ArrowPoolVector prefix_lengths(num_values); + ArrowPoolVector prefix_lengths(num_values, + ::arrow::stl::allocator(pool_)); + std::string_view last_value_view = last_value_; - if (ARROW_PREDICT_TRUE(last_value_.empty())) { - last_value_ = string_view{reinterpret_cast(src[0].ptr), src[0].len}; + if (last_value_view.empty()) { + last_value_view = string_view{reinterpret_cast(src[0].ptr), src[0].len}; suffix_encoder_.Put(&src[0], 1); prefix_lengths[0] = 0; } @@ -3185,7 +3196,7 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { uint32_t j = 0; while (j < std::min(src[i - 1].len, src[i].len)) { - if (last_value_[j] != prefix[j]) { + if (last_value_view[j] != prefix[j]) { break; } j++; @@ -3194,11 +3205,13 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { prefix_lengths[i] = j; const uint8_t* suffix_ptr = src[i].ptr + j; const uint32_t suffix_length = static_cast(src[i].len - j); - last_value_ = string_view{reinterpret_cast(suffix_ptr), suffix_length}; + last_value_view = + string_view{reinterpret_cast(suffix_ptr), suffix_length}; const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); } prefix_length_encoder_.Put(prefix_lengths.data(), num_values); + last_value_ = last_value_view; } template From 665048ff8a66f2cebb5c7e93dd9176f890ae4b16 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 23 Mar 2023 17:06:45 +0100 Subject: [PATCH 10/78] Work --- cpp/src/parquet/encoding.cc | 7 ++++++- python/pyarrow/tests/parquet/test_basic.py | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 56c9691115f..a0f320a6f11 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3027,7 +3027,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
FlushValues() override; @@ -3062,6 +3063,10 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
= kMaxByteArraySize)) { + throw Status::Invalid("Parquet cannot store strings with size 2GB or more"); + } + PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<::arrow::FixedSizeBinaryType>( *array.data(), [&](::std::string_view view) { diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 0f5a6432aab..dd12a266165 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -393,7 +393,8 @@ def test_column_encoding(use_legacy_dataset): arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) arr_bin = pa.array([str(x) for x in range(100)], type=pa.binary()) - arr_flba = pa.array([str(x) for x in range(100)], type=pa.binary(10)) + arr_flba = pa.array( + [str(x).zfill(10) for x in range(100)], type=pa.binary(10)) arr_bool = pa.array([False, True, False, False] * 25) mixed_table = pa.Table.from_arrays( [arr_float, arr_int, arr_bin, arr_flba, arr_bool], From 4c9b90f46c77b0abf780e8e7a4ddb234e5762137 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 24 Mar 2023 02:23:42 +0100 Subject: [PATCH 11/78] Review feedback --- cpp/src/parquet/encoding.cc | 41 ++++++++++++++++++---- python/pyarrow/tests/parquet/test_basic.py | 4 +-- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index a0f320a6f11..10cb9290dd6 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1189,7 +1189,7 @@ struct ArrowBinaryHelper { Status PushChunk() { std::shared_ptr<::arrow::Array> result; RETURN_NOT_OK(builder->Finish(&result)); - out->chunks.push_back(result); + out->chunks.push_back(std::move(result)); chunk_space_remaining = ::arrow::kBinaryMemoryLimit; return Status::OK(); } @@ -3152,15 +3152,42 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
inline void DeltaByteArrayEncoder::Put(const FixedLenByteArray* src, int num_values) { - if (descr_->type_length() == 0) { + if (num_values == 0) { return; } - // TODO: This is a temporary solution. - for (int i = 0; i < num_values; ++i) { - // Write the result to the output stream - DCHECK(src[i].ptr != nullptr) << "Value ptr cannot be NULL"; - PARQUET_THROW_NOT_OK(sink_.Append(src[i].ptr, descr_->type_length())); + const uint32_t byte_width = sizeof(src[0]); + ArrowPoolVector prefix_lengths(num_values, + ::arrow::stl::allocator(pool_)); + std::string_view last_value_view = last_value_; + + if (last_value_view.empty()) { + last_value_view = string_view{reinterpret_cast(src[0].ptr), byte_width}; + const ByteArray value{byte_width, src[0].ptr}; + suffix_encoder_.Put(&value, 1); + prefix_lengths[0] = 0; } + + for (int32_t i = 1; i < num_values; i++) { + auto prefix = string_view{reinterpret_cast(src[i].ptr), byte_width}; + + uint32_t j = 0; + while (j < byte_width) { + if (last_value_view[j] != prefix[j]) { + break; + } + j++; + } + + prefix_lengths[i] = j; + const uint8_t* suffix_ptr = src[i].ptr + j; + const uint32_t suffix_length = static_cast(byte_width - j); + last_value_view = + string_view{reinterpret_cast(suffix_ptr), suffix_length}; + const ByteArray suffix(suffix_length, suffix_ptr); + suffix_encoder_.Put(&suffix, 1); + } + prefix_length_encoder_.Put(prefix_lengths.data(), num_values); + last_value_ = last_value_view; } template <> diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index dd12a266165..78ba2d94b82 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -431,12 +431,12 @@ def test_column_encoding(use_legacy_dataset): use_legacy_dataset=use_legacy_dataset) # Check "DELTA_BYTE_ARRAY" for byte columns. + # TODO: 'd': "DELTA_BYTE_ARRAY" _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", - 'c': "DELTA_BYTE_ARRAY", - 'd': "DELTA_BYTE_ARRAY"}, + 'c': "DELTA_BYTE_ARRAY"}, use_legacy_dataset=use_legacy_dataset) # Check "RLE" for boolean columns. From 23d663cf2c60269c8198dfa445031c272736bae7 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 25 Mar 2023 04:17:13 +0100 Subject: [PATCH 12/78] Review feedback --- cpp/src/parquet/encoding.cc | 142 ++++++++---------------------------- 1 file changed, 29 insertions(+), 113 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 10cb9290dd6..7b0a1237225 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3008,11 +3008,13 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { // ---------------------------------------------------------------------- // DELTA_BYTE_ARRAY -// This is also known as incremental encoding or front compression: for each element in a -// sequence of strings, store the prefix length of the previous entry plus the suffix. -// -// This is stored as a sequence of delta-encoded prefix lengths (DELTA_BINARY_PACKED), -// followed by the suffixes encoded as delta length byte arrays (DELTA_LENGTH_BYTE_ARRAY). +/// Delta Byte Array encoding also known as incremental encoding or front compression: +/// for each element in a sequence of strings, store the prefix length of the previous +/// entry plus the suffix. +/// +/// This is stored as a sequence of delta-encoded prefix lengths (DELTA_BINARY_PACKED), +/// followed by the suffixes encoded as delta length byte arrays +/// (DELTA_LENGTH_BYTE_ARRAY). // ---------------------------------------------------------------------- // DeltaByteArrayEncoder @@ -3058,50 +3060,6 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
= kMaxByteArraySize)) { - throw Status::Invalid("Parquet cannot store strings with size 2GB or more"); - } - - PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<::arrow::FixedSizeBinaryType>( - *array.data(), - [&](::std::string_view view) { - // Convert view to ByteArray so it can be passed to the suffix_encoder_. - const ByteArray src{view}; - if (last_value_view.empty()) { - last_value_view = view; - suffix_encoder_.Put(&src, 1); - prefix_length_encoder_.Put({static_cast(0)}, 1); - previous_len = byte_width; - } else { - uint32_t j = 0; - while (j < previous_len) { - if (last_value_view[j] != view[j]) { - break; - } - j++; - } - previous_len = j; - prefix_length_encoder_.Put({static_cast(j)}, 1); - - const uint8_t* suffix_ptr = src.ptr + j; - const uint32_t suffix_length = static_cast(byte_width - j); - last_value_view = - string_view{reinterpret_cast(suffix_ptr), suffix_length}; - // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. - const ByteArray suffix(suffix_length, suffix_ptr); - suffix_encoder_.Put(&suffix, 1); - } - return Status::OK(); - }, - []() { return Status::OK(); })); - last_value_ = last_value_view; - } - template void PutBinaryArray(const ArrayType& array) { uint32_t previous_len = 0; @@ -3113,6 +3071,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
= kMaxByteArraySize)) { return Status::Invalid("Parquet cannot store strings with size 2GB or more"); } + // Convert view to ByteArray so it can be passed to the suffix_encoder_. const ByteArray src{view}; if (last_value_view.empty()) { last_value_view = view; @@ -3134,6 +3093,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(src.len - j); last_value_view = string_view{reinterpret_cast(suffix_ptr), suffix_length}; + // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); } @@ -3149,103 +3109,59 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
-inline void DeltaByteArrayEncoder::Put(const FixedLenByteArray* src, - int num_values) { +template +void DeltaByteArrayEncoder::Put(const T* src, int num_values) { if (num_values == 0) { return; } - const uint32_t byte_width = sizeof(src[0]); ArrowPoolVector prefix_lengths(num_values, ::arrow::stl::allocator(pool_)); std::string_view last_value_view = last_value_; - if (last_value_view.empty()) { - last_value_view = string_view{reinterpret_cast(src[0].ptr), byte_width}; - const ByteArray value{byte_width, src[0].ptr}; - suffix_encoder_.Put(&value, 1); - prefix_lengths[0] = 0; - } - - for (int32_t i = 1; i < num_values; i++) { - auto prefix = string_view{reinterpret_cast(src[i].ptr), byte_width}; + int i = 0; + while (i < num_values) { + // Convert to ByteArray so we can pass to the suffix_encoder_. + auto value = reinterpret_cast(&src[i]); + if (ARROW_PREDICT_FALSE(value->len >= kMaxByteArraySize)) { + throw Status::Invalid("Parquet cannot store strings with size 2GB or more"); + } + auto view = string_view{reinterpret_cast(value->ptr), value->len}; uint32_t j = 0; - while (j < byte_width) { - if (last_value_view[j] != prefix[j]) { + while (j < std::min(value->len, static_cast(last_value_view.length()))) { + if (last_value_view[j] != view[j]) { break; } j++; } prefix_lengths[i] = j; - const uint8_t* suffix_ptr = src[i].ptr + j; - const uint32_t suffix_length = static_cast(byte_width - j); + const uint8_t* suffix_ptr = value->ptr + j; + const uint32_t suffix_length = static_cast(value->len - j); last_value_view = string_view{reinterpret_cast(suffix_ptr), suffix_length}; + // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); + i++; } prefix_length_encoder_.Put(prefix_lengths.data(), num_values); last_value_ = last_value_view; } -template <> -void DeltaByteArrayEncoder::Put(const ::arrow::Array& values) { - if (!::arrow::is_fixed_size_binary(values.type_id())) { - throw ParquetException("Only FixedSizeBinaryArray and subclasses supported"); - } - PutFixedLenByteArray(checked_cast(values)); -} - template void DeltaByteArrayEncoder::Put(const ::arrow::Array& values) { - AssertBaseBinary(values); if (::arrow::is_binary_like(values.type_id())) { PutBinaryArray(checked_cast(values)); } else if (::arrow::is_large_binary_like(values.type_id())) { PutBinaryArray(checked_cast(values)); + } else if (::arrow::is_fixed_size_binary(values.type_id())) { + PutBinaryArray(checked_cast(values)); + } else { + throw ParquetException("Only BaseBinaryArray and subclasses supported"); } } -template -void DeltaByteArrayEncoder::Put(const T* src, int num_values) { - if (num_values == 0) { - return; - } - ArrowPoolVector prefix_lengths(num_values, - ::arrow::stl::allocator(pool_)); - std::string_view last_value_view = last_value_; - - if (last_value_view.empty()) { - last_value_view = string_view{reinterpret_cast(src[0].ptr), src[0].len}; - suffix_encoder_.Put(&src[0], 1); - prefix_lengths[0] = 0; - } - - for (int32_t i = 1; i < num_values; i++) { - auto prefix = string_view{reinterpret_cast(src[i].ptr), src[i].len}; - - uint32_t j = 0; - while (j < std::min(src[i - 1].len, src[i].len)) { - if (last_value_view[j] != prefix[j]) { - break; - } - j++; - } - - prefix_lengths[i] = j; - const uint8_t* suffix_ptr = src[i].ptr + j; - const uint32_t suffix_length = static_cast(src[i].len - j); - last_value_view = - string_view{reinterpret_cast(suffix_ptr), suffix_length}; - const ByteArray suffix(suffix_length, suffix_ptr); - suffix_encoder_.Put(&suffix, 1); - } - prefix_length_encoder_.Put(prefix_lengths.data(), num_values); - last_value_ = last_value_view; -} - template std::shared_ptr DeltaByteArrayEncoder::FlushValues() { PARQUET_THROW_NOT_OK(sink_.Resize(EstimatedDataEncodedSize(), false)); From c6408f559e26a3ccacbb0efb4385680433a403f7 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 25 Mar 2023 16:21:15 +0100 Subject: [PATCH 13/78] Refactoring --- cpp/src/parquet/encoding.cc | 97 ++++++++++++------------------------- 1 file changed, 31 insertions(+), 66 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 7b0a1237225..26fc43108ef 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1178,7 +1178,11 @@ int PlainBooleanDecoder::Decode(bool* buffer, int max_values) { return max_values; } -struct ArrowBinaryHelper { +template +struct ArrowBinaryHelper; + +template <> +struct ArrowBinaryHelper { explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* out) { this->out = out; this->builder = out->builder.get(); @@ -1216,8 +1220,9 @@ struct ArrowBinaryHelper { int64_t chunk_space_remaining; }; -struct ArrowFLBAHelper { - explicit ArrowFLBAHelper(::arrow::FixedSizeBinaryBuilder* builder) { +template <> +struct ArrowBinaryHelper { + explicit ArrowBinaryHelper(EncodingTraits::Accumulator* builder) { this->builder = builder; this->chunk_space_remaining = ::arrow::kBinaryMemoryLimit - this->builder->value_data_length(); @@ -1344,7 +1349,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out, int* out_values_decoded) { - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(out); int values_decoded = 0; RETURN_NOT_OK(helper.builder->Reserve(num_values)); @@ -1865,7 +1870,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, constexpr int32_t kBufferSize = 1024; int32_t indices[kBufferSize]; - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(out); auto dict_values = reinterpret_cast(dictionary_->data()); int values_decoded = 0; @@ -1934,7 +1939,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int32_t indices[kBufferSize]; int values_decoded = 0; - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(out); auto dict_values = reinterpret_cast(dictionary_->data()); while (values_decoded < num_values) { @@ -2777,7 +2782,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out, int* out_num_values) { - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(out); std::vector values(num_values - null_count); const int num_valid_values = Decode(values.data(), num_values - null_count); @@ -3073,30 +3078,25 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(0)}, 1); - previous_len = src.len; - } else { - uint32_t j = 0; - while (j < std::min(previous_len, src.len)) { - if (last_value_view[j] != view[j]) { - break; - } - j++; + + uint32_t j = 0; + while (j < std::min(previous_len, src.len)) { + if (last_value_view[j] != view[j]) { + break; } - previous_len = j; - prefix_length_encoder_.Put({static_cast(j)}, 1); - - const uint8_t* suffix_ptr = src.ptr + j; - const uint32_t suffix_length = static_cast(src.len - j); - last_value_view = - string_view{reinterpret_cast(suffix_ptr), suffix_length}; - // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. - const ByteArray suffix(suffix_length, suffix_ptr); - suffix_encoder_.Put(&suffix, 1); + j++; } + previous_len = j; + prefix_length_encoder_.Put({static_cast(j)}, 1); + + const uint8_t* suffix_ptr = src.ptr + j; + const uint32_t suffix_length = static_cast(src.len - j); + last_value_view = + string_view{reinterpret_cast(suffix_ptr), suffix_length}; + // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. + const ByteArray suffix(suffix_length, suffix_ptr); + suffix_encoder_.Put(&suffix, 1); + return Status::OK(); }, []() { return Status::OK(); })); @@ -3295,44 +3295,9 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, - int* out_num_values) { - ArrowFLBAHelper helper(out); - - std::vector values(num_values); - const int num_valid_values = GetInternal(values.data(), num_values - null_count); - DCHECK_EQ(num_values - null_count, num_valid_values); - - auto values_ptr = reinterpret_cast(values.data()); - int value_idx = 0; - - RETURN_NOT_OK(VisitNullBitmapInline( - valid_bits, valid_bits_offset, num_values, null_count, - [&]() { - const auto& val = values_ptr[value_idx]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); - } - RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); - ++value_idx; - return Status::OK(); - }, - [&]() { - RETURN_NOT_OK(helper.AppendNull()); - --null_count; - return Status::OK(); - })); - - DCHECK_EQ(null_count, 0); - *out_num_values = num_valid_values; - return Status::OK(); - } - - Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, + typename EncodingTraits::Accumulator* out, int* out_num_values) { - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(out); std::vector values(num_values); const int num_valid_values = GetInternal(values.data(), num_values - null_count); From 54053f9e8c271441879a787c8497a08c0a164bdd Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 28 Mar 2023 01:32:36 +0200 Subject: [PATCH 14/78] Apply suggestions from code review Co-authored-by: mwish <1506118561@qq.com> --- cpp/src/parquet/encoding.cc | 11 +++++----- cpp/src/parquet/encoding_test.cc | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 26fc43108ef..67942859f48 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3067,7 +3067,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
void PutBinaryArray(const ArrayType& array) { - uint32_t previous_len = 0; + uint32_t previous_len = static_cast(last_value_.size()); std::string_view last_value_view = last_value_; PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( @@ -3086,13 +3086,12 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(j)}, 1); const uint8_t* suffix_ptr = src.ptr + j; const uint32_t suffix_length = static_cast(src.len - j); - last_value_view = - string_view{reinterpret_cast(suffix_ptr), suffix_length}; + last_value_view = view; // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); @@ -3138,8 +3137,7 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { prefix_lengths[i] = j; const uint8_t* suffix_ptr = value->ptr + j; const uint32_t suffix_length = static_cast(value->len - j); - last_value_view = - string_view{reinterpret_cast(suffix_ptr), suffix_length}; + last_value_view = view; // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); @@ -3174,6 +3172,7 @@ std::shared_ptr DeltaByteArrayEncoder::FlushValues() { std::shared_ptr buffer; PARQUET_THROW_NOT_OK(sink_.Finish(&buffer, true)); + last_value_.clear(); return buffer; } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index fb2911f807b..e7f6ae7acf5 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2130,4 +2130,41 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPutFixedLength) { } } } + +TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { + auto CheckEncode = [](std::shared_ptr<::arrow::Array> values, + std::shared_ptr<::arrow::Array> prefix_lengths, + std::shared_ptr<::arrow::Array> suffix_lengths, std::string_view value) { + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = encoder->FlushValues(); + + auto prefix_lengths_encoder = MakeTypedEncoder(Encoding::DELTA_BINARY_PACKED); + ASSERT_NO_THROW(prefix_lengths_encoder->Put(*prefix_lengths)); + auto prefix_lengths_buf = prefix_lengths_encoder->FlushValues(); + + auto encoded_prefix_lengths_buf = SliceBuffer(buf, 0, prefix_lengths_buf->size()); + + auto suffix_lengths_encoder = MakeTypedEncoder(Encoding::DELTA_BINARY_PACKED); + ASSERT_NO_THROW(suffix_lengths_encoder->Put(*suffix_lengths)); + auto suffix_lengths_buf = suffix_lengths_encoder->FlushValues(); + + auto encoded_values_buf = SliceBuffer(buf, prefix_lengths_buf->size() + suffix_lengths_buf->size()); + + auto encoded_prefix_length_buf = SliceBuffer(buf, 0, prefix_lengths_buf->size()); + EXPECT_TRUE(prefix_lengths_buf->Equals(*encoded_prefix_length_buf)); + auto encoded_suffix_length_buf = SliceBuffer(buf, prefix_lengths_buf->size(), suffix_lengths_buf->size()); + EXPECT_TRUE(suffix_lengths_buf->Equals(*encoded_suffix_length_buf)); + EXPECT_EQ(value, encoded_values_buf->ToString()); + }; + + auto values = R"(["axis", "axle", "babble", "babyhood"])"; + auto prefix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([0, 2, 0, 3])"); + auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([4, 2, 6, 5])"); + + CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), prefix_lengths, suffix_lengths, "axislebabbleyhood"); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), prefix_lengths, suffix_lengths, "axislebabbleyhood"); + CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), prefix_lengths, suffix_lengths, "axislebabbleyhood"); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), prefix_lengths, suffix_lengths, "axislebabbleyhood"); +} } // namespace parquet::test From e88c838292ca582112a08bc84df8e85f3bda6034 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 28 Mar 2023 01:45:11 +0200 Subject: [PATCH 15/78] Linting and adding a python flba test --- cpp/src/parquet/encoding_test.cc | 27 ++++++++++++++-------- python/pyarrow/tests/parquet/test_basic.py | 4 ++-- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index e7f6ae7acf5..a9abdc9a2b0 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2134,26 +2134,31 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPutFixedLength) { TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { auto CheckEncode = [](std::shared_ptr<::arrow::Array> values, std::shared_ptr<::arrow::Array> prefix_lengths, - std::shared_ptr<::arrow::Array> suffix_lengths, std::string_view value) { + std::shared_ptr<::arrow::Array> suffix_lengths, + std::string_view value) { auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); ASSERT_NO_THROW(encoder->Put(*values)); auto buf = encoder->FlushValues(); - auto prefix_lengths_encoder = MakeTypedEncoder(Encoding::DELTA_BINARY_PACKED); + auto prefix_lengths_encoder = + MakeTypedEncoder(Encoding::DELTA_BINARY_PACKED); ASSERT_NO_THROW(prefix_lengths_encoder->Put(*prefix_lengths)); auto prefix_lengths_buf = prefix_lengths_encoder->FlushValues(); auto encoded_prefix_lengths_buf = SliceBuffer(buf, 0, prefix_lengths_buf->size()); - auto suffix_lengths_encoder = MakeTypedEncoder(Encoding::DELTA_BINARY_PACKED); + auto suffix_lengths_encoder = + MakeTypedEncoder(Encoding::DELTA_BINARY_PACKED); ASSERT_NO_THROW(suffix_lengths_encoder->Put(*suffix_lengths)); auto suffix_lengths_buf = suffix_lengths_encoder->FlushValues(); - auto encoded_values_buf = SliceBuffer(buf, prefix_lengths_buf->size() + suffix_lengths_buf->size()); + auto encoded_values_buf = + SliceBuffer(buf, prefix_lengths_buf->size() + suffix_lengths_buf->size()); auto encoded_prefix_length_buf = SliceBuffer(buf, 0, prefix_lengths_buf->size()); EXPECT_TRUE(prefix_lengths_buf->Equals(*encoded_prefix_length_buf)); - auto encoded_suffix_length_buf = SliceBuffer(buf, prefix_lengths_buf->size(), suffix_lengths_buf->size()); + auto encoded_suffix_length_buf = + SliceBuffer(buf, prefix_lengths_buf->size(), suffix_lengths_buf->size()); EXPECT_TRUE(suffix_lengths_buf->Equals(*encoded_suffix_length_buf)); EXPECT_EQ(value, encoded_values_buf->ToString()); }; @@ -2162,9 +2167,13 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { auto prefix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([0, 2, 0, 3])"); auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([4, 2, 6, 5])"); - CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), prefix_lengths, suffix_lengths, "axislebabbleyhood"); - CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), prefix_lengths, suffix_lengths, "axislebabbleyhood"); - CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), prefix_lengths, suffix_lengths, "axislebabbleyhood"); - CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), prefix_lengths, suffix_lengths, "axislebabbleyhood"); + CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), prefix_lengths, + suffix_lengths, "axislebabbleyhood"); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), prefix_lengths, + suffix_lengths, "axislebabbleyhood"); + CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), prefix_lengths, + suffix_lengths, "axislebabbleyhood"); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), prefix_lengths, + suffix_lengths, "axislebabbleyhood"); } } // namespace parquet::test diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 78ba2d94b82..dd12a266165 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -431,12 +431,12 @@ def test_column_encoding(use_legacy_dataset): use_legacy_dataset=use_legacy_dataset) # Check "DELTA_BYTE_ARRAY" for byte columns. - # TODO: 'd': "DELTA_BYTE_ARRAY" _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", - 'c': "DELTA_BYTE_ARRAY"}, + 'c': "DELTA_BYTE_ARRAY", + 'd': "DELTA_BYTE_ARRAY"}, use_legacy_dataset=use_legacy_dataset) # Check "RLE" for boolean columns. From c74e3f2780596e91fac3a063c6ee1f26e4f88ea9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 31 Mar 2023 02:29:58 +0200 Subject: [PATCH 16/78] Update cpp/src/parquet/encoding.cc Co-authored-by: Will Jones --- cpp/src/parquet/encoding.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 67942859f48..8d08cfb6b57 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3127,7 +3127,8 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { auto view = string_view{reinterpret_cast(value->ptr), value->len}; uint32_t j = 0; - while (j < std::min(value->len, static_cast(last_value_view.length()))) { + int32_t common_length = std::min(value->len, static_cast(last_value_view.length())); + while (j < common_length) { if (last_value_view[j] != view[j]) { break; } From 8994bf62fec4b2ed73517918a3ba9939ac617043 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 31 Mar 2023 03:00:39 +0200 Subject: [PATCH 17/78] Review feedback --- cpp/src/parquet/encoding.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 8d08cfb6b57..098303aa8e6 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3080,7 +3080,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
::Put(const T* src, int num_values) { auto view = string_view{reinterpret_cast(value->ptr), value->len}; uint32_t j = 0; - int32_t common_length = std::min(value->len, static_cast(last_value_view.length())); + uint32_t common_length = + std::min(value->len, static_cast(last_value_view.length())); while (j < common_length) { if (last_value_view[j] != view[j]) { break; From 925f1f7da520f0eff0d2f1afda91efa972056782 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 31 Mar 2023 10:06:33 +0200 Subject: [PATCH 18/78] CheckDecode Co-authored-by: mwish <1506118561@qq.com> --- cpp/src/parquet/encoding.cc | 4 +- cpp/src/parquet/encoding_test.cc | 107 +++++++++++++++++++++++++++---- 2 files changed, 96 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 098303aa8e6..cc5fb4b08c6 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3080,7 +3080,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
::Put(const T* src, int num_values) { auto view = string_view{reinterpret_cast(value->ptr), value->len}; uint32_t j = 0; - uint32_t common_length = + const uint32_t common_length = std::min(value->len, static_cast(last_value_view.length())); while (j < common_length) { if (last_value_view[j] != view[j]) { diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index a9abdc9a2b0..0947dc5e41a 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2151,7 +2151,6 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { MakeTypedEncoder(Encoding::DELTA_BINARY_PACKED); ASSERT_NO_THROW(suffix_lengths_encoder->Put(*suffix_lengths)); auto suffix_lengths_buf = suffix_lengths_encoder->FlushValues(); - auto encoded_values_buf = SliceBuffer(buf, prefix_lengths_buf->size() + suffix_lengths_buf->size()); @@ -2163,17 +2162,99 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { EXPECT_EQ(value, encoded_values_buf->ToString()); }; - auto values = R"(["axis", "axle", "babble", "babyhood"])"; - auto prefix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([0, 2, 0, 3])"); - auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([4, 2, 6, 5])"); - - CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), prefix_lengths, - suffix_lengths, "axislebabbleyhood"); - CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), prefix_lengths, - suffix_lengths, "axislebabbleyhood"); - CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), prefix_lengths, - suffix_lengths, "axislebabbleyhood"); - CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), prefix_lengths, - suffix_lengths, "axislebabbleyhood"); + auto arrayToI32 = [](const std::shared_ptr<::arrow::Array>& lengths) { + std::vector arrays; + auto data_ptr = checked_cast<::arrow::Int32Array*>(lengths.get()); + for (int i = 0; i < lengths->length(); ++i) { + arrays.push_back(data_ptr->GetView(i)); + } + return arrays; + }; + + auto CheckDecode = [](std::shared_ptr buf, + std::shared_ptr<::arrow::Array> values) { + int num_values = static_cast(values->length()); + auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + + typename EncodingTraits::Accumulator acc; + if (::arrow::is_string(values->type()->id())) { + acc.builder = std::make_unique<::arrow::StringBuilder>(); + } else { + acc.builder = std::make_unique<::arrow::BinaryBuilder>(); + } + + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), + values->null_bitmap_data(), values->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.builder->Finish(&result)); + ASSERT_EQ(num_values, result->length()); + ASSERT_OK(result->ValidateFull()); + + auto upcast_result = CastBinaryTypesHelper(result, values->type()); + ::arrow::AssertArraysEqual(*values, *upcast_result); + }; + + auto checkEncodeDecode = [&](std::string_view values, + std::shared_ptr<::arrow::Array> prefix_lengths, + std::shared_ptr<::arrow::Array> suffix_lengths, + std::string_view suffix_data) { + CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), prefix_lengths, + suffix_lengths, suffix_data); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), prefix_lengths, + suffix_lengths, suffix_data); + CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), prefix_lengths, + suffix_lengths, suffix_data); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), prefix_lengths, + suffix_lengths, suffix_data); + + auto encoded = ::arrow::ConcatenateBuffers({DeltaEncode(arrayToI32(prefix_lengths)), + DeltaEncode(arrayToI32(suffix_lengths)), + std::make_shared(suffix_data)}) + .ValueOrDie(); + + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::utf8(), values)); + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_utf8(), values)); + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::binary(), values)); + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); + }; + + { + auto values = R"(["axis", "axle", "babble", "babyhood"])"; + auto prefix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([0, 2, 0, 3])"); + auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([4, 2, 6, 5])"); + + constexpr std::string_view suffix_data = "axislebabbleyhood"; + checkEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); + } + + { + auto values = R"(["axis", "axis", "axis", "axis"])"; + auto prefix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([0, 4, 4, 4])"); + auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([4, 0, 0, 0])"); + + constexpr std::string_view suffix_data = "axis"; + checkEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); + } + + { + auto values = R"(["axisba", "axis", "axis", "axis"])"; + auto prefix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([0, 4, 4, 4])"); + auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([6, 0, 0, 0])"); + + constexpr std::string_view suffix_data = "axisba"; + checkEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); + } + { + auto values = R"(["baaxis", "axis", "axis", "axis"])"; + auto prefix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([0, 0, 4, 4])"); + auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([6, 4, 0, 0])"); + + constexpr std::string_view suffix_data = "baaxisaxis"; + checkEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); + } } } // namespace parquet::test From 800c3f8da782f8001a4f59be9136437d4759a23c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 31 Mar 2023 22:16:43 +0200 Subject: [PATCH 19/78] Work --- cpp/src/parquet/encoding.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index cc5fb4b08c6..67942859f48 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3080,8 +3080,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
::Put(const T* src, int num_values) { auto view = string_view{reinterpret_cast(value->ptr), value->len}; uint32_t j = 0; - const uint32_t common_length = - std::min(value->len, static_cast(last_value_view.length())); - while (j < common_length) { + while (j < std::min(value->len, static_cast(last_value_view.length()))) { if (last_value_view[j] != view[j]) { break; } From 10cadb1a628f10f97121ba0bcc69ca15df1145c9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 4 Apr 2023 15:01:41 +0200 Subject: [PATCH 20/78] Review feedback --- cpp/src/parquet/encoding.cc | 105 ++++++++++++++++++++++++++----- cpp/src/parquet/encoding_test.cc | 9 ++- 2 files changed, 93 insertions(+), 21 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 67942859f48..f6c1cacd3c2 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -56,6 +56,8 @@ using arrow::Status; using arrow::VisitNullBitmapInline; using arrow::internal::AddWithOverflow; using arrow::internal::checked_cast; +using arrow::internal::MultiplyWithOverflow; +using arrow::internal::SubtractWithOverflow; using arrow::util::SafeLoad; using arrow::util::SafeLoadAs; using std::string_view; @@ -1186,6 +1188,11 @@ struct ArrowBinaryHelper { explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* out) { this->out = out; this->builder = out->builder.get(); + if (SubtractWithOverflow(::arrow::kBinaryMemoryLimit, + this->builder->value_data_length(), + &this->chunk_space_remaining)) { + throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); + } this->chunk_space_remaining = ::arrow::kBinaryMemoryLimit - this->builder->value_data_length(); } @@ -1224,8 +1231,11 @@ template <> struct ArrowBinaryHelper { explicit ArrowBinaryHelper(EncodingTraits::Accumulator* builder) { this->builder = builder; - this->chunk_space_remaining = - ::arrow::kBinaryMemoryLimit - this->builder->value_data_length(); + if (SubtractWithOverflow(::arrow::kBinaryMemoryLimit, + this->builder->value_data_length(), + &this->chunk_space_remaining)) { + throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); + } } Status PushChunk() { @@ -1457,7 +1467,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { byte_array_offsets_(AllocateBuffer(pool, 0)), indices_scratch_space_(AllocateBuffer(pool, 0)) {} - // Perform type-specific initiatialization + // Perform type-specific initialization void SetDict(TypedDecoder* dictionary) override; void SetData(int num_values, const uint8_t* data, int len) override { @@ -3053,9 +3063,9 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
memory_pool())); - T* data = reinterpret_cast(buffer->mutable_data()); + PARQUET_ASSIGN_OR_THROW( + buffer_, ::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool())); + T* data = reinterpret_cast(buffer_->mutable_data()); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); Put(data, num_valid_values); @@ -3067,7 +3077,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
void PutBinaryArray(const ArrayType& array) { - uint32_t previous_len = static_cast(last_value_.size()); + auto previous_len = static_cast(last_value_.size()); std::string_view last_value_view = last_value_; PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( @@ -3080,7 +3090,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(j)}, 1); const uint8_t* suffix_ptr = src.ptr + j; - const uint32_t suffix_length = static_cast(src.len - j); + const auto suffix_length = static_cast(src.len - j); last_value_view = view; // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); @@ -3106,6 +3117,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
prefix_length_encoder_; DeltaLengthByteArrayEncoder suffix_encoder_; std::string last_value_; + std::unique_ptr<::arrow::Buffer> buffer_; }; template @@ -3117,8 +3129,7 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { ::arrow::stl::allocator(pool_)); std::string_view last_value_view = last_value_; - int i = 0; - while (i < num_values) { + for (int i = 0; i < num_values; i++) { // Convert to ByteArray so we can pass to the suffix_encoder_. auto value = reinterpret_cast(&src[i]); if (ARROW_PREDICT_FALSE(value->len >= kMaxByteArraySize)) { @@ -3127,7 +3138,56 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { auto view = string_view{reinterpret_cast(value->ptr), value->len}; uint32_t j = 0; - while (j < std::min(value->len, static_cast(last_value_view.length()))) { + const uint32_t common_length = + std::min(value->len, static_cast(last_value_view.length())); + while (j < common_length) { + if (last_value_view[j] != view[j]) { + break; + } + j++; + } + + prefix_lengths[i] = j; + const auto suffix_length = static_cast(value->len - j); + const uint8_t* suffix_ptr; + if (suffix_length == 0) { + suffix_ptr = reinterpret_cast(""); + } else { + suffix_ptr = value->ptr + j; + } + last_value_view = view; + // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. + const ByteArray suffix(suffix_length, suffix_ptr); + suffix_encoder_.Put(&suffix, 1); + } + prefix_length_encoder_.Put(prefix_lengths.data(), num_values); + last_value_ = last_value_view; +} + +template <> +void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { + if (num_values == 0) { + return; + } + ArrowPoolVector prefix_lengths(num_values, + ::arrow::stl::allocator(pool_)); + std::string_view last_value_view = last_value_; + const int32_t len = descr_->type_length(); + + if (ARROW_PREDICT_FALSE(len >= static_cast(kMaxByteArraySize))) { + throw Status::Invalid("Parquet cannot store strings with size 2GB or more"); + } + + for (int i = 0; i < num_values; i++) { + // Convert to ByteArray so we can pass to the suffix_encoder_. + const FLBA* value = reinterpret_cast(&src[i].ptr); + + auto view = string_view{reinterpret_cast(value->ptr), + static_cast(len)}; + int32_t j = 0; + const int32_t common_length = + std::min(len, static_cast(last_value_view.length())); + while (j < common_length) { if (last_value_view[j] != view[j]) { break; } @@ -3135,13 +3195,13 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { } prefix_lengths[i] = j; - const uint8_t* suffix_ptr = value->ptr + j; - const uint32_t suffix_length = static_cast(value->len - j); + const auto suffix_length = static_cast(len - j); + const uint8_t* suffix_ptr; + suffix_ptr = value->ptr + j; last_value_view = view; // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); - i++; } prefix_length_encoder_.Put(prefix_lengths.data(), num_values); last_value_ = last_value_view; @@ -3360,7 +3420,20 @@ class DeltaByteArrayFLBADecoder : public DeltaByteArrayDecoderImpl, return GetInternal(buffer, max_values); } int Decode(FixedLenByteArray* buffer, int max_values) override { - throw ParquetException("Cannot decode DeltaByteArray from FixedLenByteArray"); + int decoded_values_size = max_values; + if (MultiplyWithOverflow(decoded_values_size, + descr_->type_length() * sizeof(ByteArray), + &decoded_values_size)) { + throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); + } + std::vector decode_values(decoded_values_size); + auto decode_buf = reinterpret_cast(decode_values.data()); + + max_values = GetInternal(decode_buf, max_values); + for (int i = 0; i < max_values; i++) { + buffer[i].ptr = decode_buf->ptr + i * descr_->type_length(); + } + return max_values; } }; diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 0947dc5e41a..c9dc8427874 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1986,7 +1986,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; - virtual void CheckRoundtrip() { + void CheckRoundtrip() override { auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, false, descr_.get()); auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY, descr_.get()); @@ -2001,7 +2001,8 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { ASSERT_NO_FATAL_FAILURE(VerifyResults(decode_buf_, draws_, num_values_)); } - void CheckRoundtripSpaced(const uint8_t* valid_bits, int64_t valid_bits_offset) { + void CheckRoundtripSpaced(const uint8_t* valid_bits, + int64_t valid_bits_offset) override { auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, false, descr_.get()); auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY, descr_.get()); @@ -2027,11 +2028,9 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { USING_BASE_MEMBERS(); }; -typedef ::testing::Types TestDeltaByteArrayEncodingTypes; +typedef ::testing::Types TestDeltaByteArrayEncodingTypes; TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); -// TODO: add FLBAType and Decimal type tests - TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); ASSERT_NO_FATAL_FAILURE(this->Execute(250, 2)); From 597f56700ec2f997ff849b8d6864ce3740b39687 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 4 Apr 2023 22:09:03 +0200 Subject: [PATCH 21/78] Review feedback --- cpp/src/parquet/encoding.cc | 19 ++++++++++++++----- cpp/src/parquet/encoding_test.cc | 19 ++++++++++++++----- docs/source/cpp/parquet.rst | 2 +- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index f6c1cacd3c2..93cdc9eaf5d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3100,9 +3100,14 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(j)}, 1); - const uint8_t* suffix_ptr = src.ptr + j; - const auto suffix_length = static_cast(src.len - j); last_value_view = view; + const auto suffix_length = static_cast(src.len - j); + const uint8_t* suffix_ptr; + if (suffix_length == 0) { + suffix_ptr = reinterpret_cast(""); + } else { + suffix_ptr = src.ptr + j; + } // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); @@ -3147,6 +3152,7 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { j++; } + last_value_view = view; prefix_lengths[i] = j; const auto suffix_length = static_cast(value->len - j); const uint8_t* suffix_ptr; @@ -3155,7 +3161,6 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { } else { suffix_ptr = value->ptr + j; } - last_value_view = view; // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); @@ -3194,11 +3199,15 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { j++; } + last_value_view = view; prefix_lengths[i] = j; const auto suffix_length = static_cast(len - j); const uint8_t* suffix_ptr; - suffix_ptr = value->ptr + j; - last_value_view = view; + if (suffix_length == 0) { + suffix_ptr = reinterpret_cast(""); + } else { + suffix_ptr = value->ptr + j; + } // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index c9dc8427874..7439d887809 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2197,7 +2197,7 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { ::arrow::AssertArraysEqual(*values, *upcast_result); }; - auto checkEncodeDecode = [&](std::string_view values, + auto CheckEncodeDecode = [&](std::string_view values, std::shared_ptr<::arrow::Array> prefix_lengths, std::shared_ptr<::arrow::Array> suffix_lengths, std::string_view suffix_data) { @@ -2227,7 +2227,7 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([4, 2, 6, 5])"); constexpr std::string_view suffix_data = "axislebabbleyhood"; - checkEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); + CheckEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); } { @@ -2236,7 +2236,7 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([4, 0, 0, 0])"); constexpr std::string_view suffix_data = "axis"; - checkEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); + CheckEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); } { @@ -2245,15 +2245,24 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([6, 0, 0, 0])"); constexpr std::string_view suffix_data = "axisba"; - checkEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); + CheckEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); } + { auto values = R"(["baaxis", "axis", "axis", "axis"])"; auto prefix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([0, 0, 4, 4])"); auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([6, 4, 0, 0])"); constexpr std::string_view suffix_data = "baaxisaxis"; - checkEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); + CheckEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); + } + + { + auto values = R"(["καλημέρα", "καμηλιέρη", "καμηλιέρη", "καλημέρα"])"; + auto prefix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([0, 5, 18, 5])"); + auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([16, 13, 0, 11])"); + const std::string suffix_data = "καλημέρα\xbcηλιέρη\xbbημέρα"; + CheckEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data); } } } // namespace parquet::test diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 95f2d8d98dc..23fca8fd730 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -401,7 +401,7 @@ Encodings +--------------------------+----------+----------+---------+ | DELTA_BINARY_PACKED | ✓ | ✓ | | +--------------------------+----------+----------+---------+ -| DELTA_BYTE_ARRAY | ✓ | | | +| DELTA_BYTE_ARRAY | ✓ | ✓ | | +--------------------------+----------+----------+---------+ | DELTA_LENGTH_BYTE_ARRAY | ✓ | ✓ | | +--------------------------+----------+----------+---------+ From 74fbbdc0f3f4b5e5d62e170330d15ddf9209747a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 5 Apr 2023 09:26:16 +0200 Subject: [PATCH 22/78] Review feedback --- cpp/src/parquet/encoding.cc | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 93cdc9eaf5d..1cfa2ebfe3f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3086,7 +3086,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
= kMaxByteArraySize)) { return Status::Invalid("Parquet cannot store strings with size 2GB or more"); } - // Convert view to ByteArray so it can be passed to the suffix_encoder_. + // Convert to ByteArray, so it can be passed to the suffix_encoder_. const ByteArray src{view}; uint32_t j = 0; @@ -3104,11 +3104,11 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(src.len - j); const uint8_t* suffix_ptr; if (suffix_length == 0) { - suffix_ptr = reinterpret_cast(""); + suffix_ptr = nullptr; } else { suffix_ptr = src.ptr + j; } - // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. + // Convert to ByteArray, so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); @@ -3127,6 +3127,11 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
void DeltaByteArrayEncoder::Put(const T* src, int num_values) { + throw Status::Invalid("Put not implemented for " + this->descr_->ToString()); +} + +template <> +void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_values) { if (num_values == 0) { return; } @@ -3135,7 +3140,7 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { std::string_view last_value_view = last_value_; for (int i = 0; i < num_values; i++) { - // Convert to ByteArray so we can pass to the suffix_encoder_. + // Convert to ByteArray, so we can pass to the suffix_encoder_. auto value = reinterpret_cast(&src[i]); if (ARROW_PREDICT_FALSE(value->len >= kMaxByteArraySize)) { throw Status::Invalid("Parquet cannot store strings with size 2GB or more"); @@ -3157,11 +3162,11 @@ void DeltaByteArrayEncoder::Put(const T* src, int num_values) { const auto suffix_length = static_cast(value->len - j); const uint8_t* suffix_ptr; if (suffix_length == 0) { - suffix_ptr = reinterpret_cast(""); + suffix_ptr = nullptr; } else { suffix_ptr = value->ptr + j; } - // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. + // Convert to ByteArray, so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); } @@ -3184,7 +3189,7 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { } for (int i = 0; i < num_values; i++) { - // Convert to ByteArray so we can pass to the suffix_encoder_. + // Convert to FLBA, so we can access the data const FLBA* value = reinterpret_cast(&src[i].ptr); auto view = string_view{reinterpret_cast(value->ptr), @@ -3204,11 +3209,11 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { const auto suffix_length = static_cast(len - j); const uint8_t* suffix_ptr; if (suffix_length == 0) { - suffix_ptr = reinterpret_cast(""); + suffix_ptr = nullptr; } else { suffix_ptr = value->ptr + j; } - // Convert suffix to ByteArray so it can be passed to the suffix_encoder_. + // Convert to ByteArray, so it can be passed to the suffix_encoder_ const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); } From 703b8b6a847bd49f3181cc6f0897bfd9ef514318 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 6 Apr 2023 01:42:42 +0200 Subject: [PATCH 23/78] Work --- cpp/src/parquet/encoding.cc | 43 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 1cfa2ebfe3f..609d2e44a38 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3045,7 +3045,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
FlushValues() override; @@ -3102,12 +3103,11 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(src.len - j); - const uint8_t* suffix_ptr; if (suffix_length == 0) { - suffix_ptr = nullptr; - } else { - suffix_ptr = src.ptr + j; + suffix_encoder_.Put(&kEmpty, 1); + return Status::OK(); } + const uint8_t* suffix_ptr = src.ptr + j; // Convert to ByteArray, so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); @@ -3122,6 +3122,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
prefix_length_encoder_; DeltaLengthByteArrayEncoder suffix_encoder_; std::string last_value_; + const ByteArray kEmpty; std::unique_ptr<::arrow::Buffer> buffer_; }; @@ -3141,15 +3142,16 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val for (int i = 0; i < num_values; i++) { // Convert to ByteArray, so we can pass to the suffix_encoder_. - auto value = reinterpret_cast(&src[i]); - if (ARROW_PREDICT_FALSE(value->len >= kMaxByteArraySize)) { + const ByteArray value = src[i]; + if (ARROW_PREDICT_FALSE(value.len >= static_cast(kMaxByteArraySize))) { throw Status::Invalid("Parquet cannot store strings with size 2GB or more"); } - auto view = string_view{reinterpret_cast(value->ptr), value->len}; + auto view = string_view{reinterpret_cast(value.ptr), + static_cast(value.len)}; uint32_t j = 0; const uint32_t common_length = - std::min(value->len, static_cast(last_value_view.length())); + std::min(value.len, static_cast(last_value_view.length())); while (j < common_length) { if (last_value_view[j] != view[j]) { break; @@ -3159,13 +3161,13 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val last_value_view = view; prefix_lengths[i] = j; - const auto suffix_length = static_cast(value->len - j); - const uint8_t* suffix_ptr; + const auto suffix_length = static_cast(value.len - j); + if (suffix_length == 0) { - suffix_ptr = nullptr; - } else { - suffix_ptr = value->ptr + j; + suffix_encoder_.Put(&kEmpty, 1); + continue; } + const uint8_t* suffix_ptr = value.ptr + j; // Convert to ByteArray, so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); @@ -3189,10 +3191,7 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { } for (int i = 0; i < num_values; i++) { - // Convert to FLBA, so we can access the data - const FLBA* value = reinterpret_cast(&src[i].ptr); - - auto view = string_view{reinterpret_cast(value->ptr), + auto view = string_view{reinterpret_cast(src[i].ptr), static_cast(len)}; int32_t j = 0; const int32_t common_length = @@ -3207,12 +3206,12 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { last_value_view = view; prefix_lengths[i] = j; const auto suffix_length = static_cast(len - j); - const uint8_t* suffix_ptr; + if (suffix_length == 0) { - suffix_ptr = nullptr; - } else { - suffix_ptr = value->ptr + j; + suffix_encoder_.Put(&kEmpty, 1); + continue; } + const uint8_t* suffix_ptr = src[i].ptr + j; // Convert to ByteArray, so it can be passed to the suffix_encoder_ const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); From 6949b9896a806f1938689a90f36dc493208ea3bf Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 6 Apr 2023 11:38:15 +0200 Subject: [PATCH 24/78] Review feedback --- cpp/src/parquet/encoding.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 609d2e44a38..6c05b7e0a0f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3164,7 +3164,6 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val const auto suffix_length = static_cast(value.len - j); if (suffix_length == 0) { - suffix_encoder_.Put(&kEmpty, 1); continue; } const uint8_t* suffix_ptr = value.ptr + j; @@ -3208,7 +3207,6 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { const auto suffix_length = static_cast(len - j); if (suffix_length == 0) { - suffix_encoder_.Put(&kEmpty, 1); continue; } const uint8_t* suffix_ptr = src[i].ptr + j; From ea6704925a9ba25b726ee3c1231b2b0b1a1e283d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 6 Apr 2023 13:14:13 +0200 Subject: [PATCH 25/78] Work --- cpp/src/parquet/encoding.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 6c05b7e0a0f..74319a19404 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3045,8 +3045,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
FlushValues() override; @@ -3104,7 +3103,9 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(src.len - j); if (suffix_length == 0) { - suffix_encoder_.Put(&kEmpty, 1); + const auto suffix_ptr = reinterpret_cast(""); + const ByteArray suffix(suffix_length, suffix_ptr); + suffix_encoder_.Put(&suffix, 1); return Status::OK(); } const uint8_t* suffix_ptr = src.ptr + j; @@ -3122,7 +3123,6 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
prefix_length_encoder_; DeltaLengthByteArrayEncoder suffix_encoder_; std::string last_value_; - const ByteArray kEmpty; std::unique_ptr<::arrow::Buffer> buffer_; }; From 9c313988d16b1f065f1a7ce4e5e2f8cb0de4a6d0 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 6 Apr 2023 14:16:01 +0200 Subject: [PATCH 26/78] Change to zero length suffix --- cpp/src/parquet/encoding.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 74319a19404..9ded9f40538 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3103,8 +3103,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(src.len - j); if (suffix_length == 0) { - const auto suffix_ptr = reinterpret_cast(""); - const ByteArray suffix(suffix_length, suffix_ptr); + const ByteArray suffix(suffix_length, nullptr); suffix_encoder_.Put(&suffix, 1); return Status::OK(); } From 0d5140c499f6ade1ab95ec808052f9675ce181c0 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 6 Apr 2023 15:31:44 +0200 Subject: [PATCH 27/78] Work --- cpp/src/parquet/encoding.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 9ded9f40538..1786099a62e 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3045,7 +3045,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(""))) {} std::shared_ptr FlushValues() override; @@ -3103,8 +3104,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(src.len - j); if (suffix_length == 0) { - const ByteArray suffix(suffix_length, nullptr); - suffix_encoder_.Put(&suffix, 1); + suffix_encoder_.Put(&kEmpty, 1); return Status::OK(); } const uint8_t* suffix_ptr = src.ptr + j; @@ -3123,6 +3123,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
suffix_encoder_; std::string last_value_; std::unique_ptr<::arrow::Buffer> buffer_; + const ByteArray kEmpty; }; template From d2bfd7f488a447cc16f3bfea10dcca08941dd0de Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 7 Apr 2023 11:21:02 +0200 Subject: [PATCH 28/78] Review feedback --- cpp/src/parquet/encoding.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 1786099a62e..6f051fca8e7 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1188,9 +1188,9 @@ struct ArrowBinaryHelper { explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* out) { this->out = out; this->builder = out->builder.get(); - if (SubtractWithOverflow(::arrow::kBinaryMemoryLimit, - this->builder->value_data_length(), - &this->chunk_space_remaining)) { + if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, + this->builder->value_data_length(), + &this->chunk_space_remaining))) { throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); } this->chunk_space_remaining = @@ -1231,9 +1231,9 @@ template <> struct ArrowBinaryHelper { explicit ArrowBinaryHelper(EncodingTraits::Accumulator* builder) { this->builder = builder; - if (SubtractWithOverflow(::arrow::kBinaryMemoryLimit, - this->builder->value_data_length(), - &this->chunk_space_remaining)) { + if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, + this->builder->value_data_length(), + &this->chunk_space_remaining))) { throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); } } @@ -3164,6 +3164,7 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val const auto suffix_length = static_cast(value.len - j); if (suffix_length == 0) { + suffix_encoder_.Put(&kEmpty, 1); continue; } const uint8_t* suffix_ptr = value.ptr + j; @@ -3207,6 +3208,7 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { const auto suffix_length = static_cast(len - j); if (suffix_length == 0) { + suffix_encoder_.Put(&kEmpty, 1); continue; } const uint8_t* suffix_ptr = src[i].ptr + j; From 4d1debf7487658405850d14579400ff638426881 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 11 Apr 2023 01:57:49 +0200 Subject: [PATCH 29/78] Review feedback --- cpp/src/parquet/encoding.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 6f051fca8e7..46e0d691fd6 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3065,7 +3065,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
memory_pool())); + auto buffer_, + ::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool())); T* data = reinterpret_cast(buffer_->mutable_data()); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); @@ -3122,13 +3123,12 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
prefix_length_encoder_; DeltaLengthByteArrayEncoder suffix_encoder_; std::string last_value_; - std::unique_ptr<::arrow::Buffer> buffer_; const ByteArray kEmpty; }; template void DeltaByteArrayEncoder::Put(const T* src, int num_values) { - throw Status::Invalid("Put not implemented for " + this->descr_->ToString()); + throw ParquetException("Put not implemented for " + this->descr_->ToString()); } template <> @@ -3144,7 +3144,7 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val // Convert to ByteArray, so we can pass to the suffix_encoder_. const ByteArray value = src[i]; if (ARROW_PREDICT_FALSE(value.len >= static_cast(kMaxByteArraySize))) { - throw Status::Invalid("Parquet cannot store strings with size 2GB or more"); + throw ParquetException("Parquet cannot store strings with size 2GB or more"); } auto view = string_view{reinterpret_cast(value.ptr), From 37e543611fbc6061cd76bae2c6c8e44530d656cd Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 11 Apr 2023 02:46:35 +0200 Subject: [PATCH 30/78] Review feedback --- cpp/src/parquet/encoding.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 46e0d691fd6..c8c9c5f5b58 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3260,6 +3260,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode explicit DeltaByteArrayDecoderImpl(const ColumnDescriptor* descr, MemoryPool* pool = ::arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY), + pool_(pool), prefix_len_decoder_(nullptr, pool), suffix_decoder_(nullptr, pool), last_value_in_previous_page_(""), @@ -3400,6 +3401,8 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode return Status::OK(); } + MemoryPool* pool_; + private: std::shared_ptr<::arrow::bit_util::BitReader> decoder_; DeltaBitPackDecoder prefix_len_decoder_; @@ -3428,6 +3431,7 @@ class DeltaByteArrayFLBADecoder : public DeltaByteArrayDecoderImpl, public: using Base = DeltaByteArrayDecoderImpl; using Base::DeltaByteArrayDecoderImpl; + using Base::pool_; int Decode(ByteArray* buffer, int max_values) { return GetInternal(buffer, max_values); @@ -3439,7 +3443,8 @@ class DeltaByteArrayFLBADecoder : public DeltaByteArrayDecoderImpl, &decoded_values_size)) { throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); } - std::vector decode_values(decoded_values_size); + ArrowPoolVector decode_values(decoded_values_size, + ::arrow::stl::allocator(pool_)); auto decode_buf = reinterpret_cast(decode_values.data()); max_values = GetInternal(decode_buf, max_values); From 58c89bc159b8381302e9f3a929e5dac7711cacd9 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 11 Apr 2023 11:01:17 +0200 Subject: [PATCH 31/78] Review feedback --- cpp/src/parquet/encoding.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index c8c9c5f5b58..c75e7a91f19 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3064,10 +3064,9 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
memory_pool())); - T* data = reinterpret_cast(buffer_->mutable_data()); + PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T), + this->memory_pool())); + T* data = reinterpret_cast(buffer->mutable_data()); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); Put(data, num_valid_values); @@ -3187,7 +3186,7 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { const int32_t len = descr_->type_length(); if (ARROW_PREDICT_FALSE(len >= static_cast(kMaxByteArraySize))) { - throw Status::Invalid("Parquet cannot store strings with size 2GB or more"); + throw ParquetException("Parquet cannot store strings with size 2GB or more"); } for (int i = 0; i < num_values; i++) { From 562edd8898bb3c6f61d79d90dc385c6697005a87 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 2 May 2023 20:54:36 +0200 Subject: [PATCH 32/78] Change exception message. --- cpp/src/parquet/encoding.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index c75e7a91f19..0db24168360 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1191,7 +1191,7 @@ struct ArrowBinaryHelper { if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, this->builder->value_data_length(), &this->chunk_space_remaining))) { - throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); + throw ParquetException("excess expansion in ArrowBinaryHelper"); } this->chunk_space_remaining = ::arrow::kBinaryMemoryLimit - this->builder->value_data_length(); @@ -1234,7 +1234,7 @@ struct ArrowBinaryHelper { if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, this->builder->value_data_length(), &this->chunk_space_remaining))) { - throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); + throw ParquetException("excess expansion in ArrowBinaryHelper"); } } From 01f8f941617029de82ed2ffd43874bd9215bfa0d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 16 May 2023 19:00:11 +0200 Subject: [PATCH 33/78] Apply suggestions from code review Co-authored-by: Antoine Pitrou --- cpp/src/parquet/encoding_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 7439d887809..903a4896345 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1988,7 +1988,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { void CheckRoundtrip() override { auto encoder = - MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, false, descr_.get()); + MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, /*xxx=*/ false, descr_.get()); auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY, descr_.get()); encoder->Put(draws_, num_values_); @@ -2028,7 +2028,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { USING_BASE_MEMBERS(); }; -typedef ::testing::Types TestDeltaByteArrayEncodingTypes; +using TestDeltaByteArrayEncodingTypes = ::testing::Types; TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { From e6cd16ba90222bfaa14c94bb4c23685581e889e7 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 17 May 2023 01:42:15 +0200 Subject: [PATCH 34/78] Review feedback --- cpp/src/parquet/encoding.cc | 17 +++-------- cpp/src/parquet/encoding_test.cc | 50 +++++++------------------------- cpp/src/parquet/types.h | 5 ++++ 3 files changed, 20 insertions(+), 52 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 0db24168360..ab277578db2 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3125,11 +3125,6 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
-void DeltaByteArrayEncoder::Put(const T* src, int num_values) { - throw ParquetException("Put not implemented for " + this->descr_->ToString()); -} - template <> void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_values) { if (num_values == 0) { @@ -3145,9 +3140,8 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val if (ARROW_PREDICT_FALSE(value.len >= static_cast(kMaxByteArraySize))) { throw ParquetException("Parquet cannot store strings with size 2GB or more"); } + auto view = std::string_view{value}; - auto view = string_view{reinterpret_cast(value.ptr), - static_cast(value.len)}; uint32_t j = 0; const uint32_t common_length = std::min(value.len, static_cast(last_value_view.length())); @@ -3163,7 +3157,7 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val const auto suffix_length = static_cast(value.len - j); if (suffix_length == 0) { - suffix_encoder_.Put(&kEmpty, 1); + // suffix_encoder_.Put(&kEmpty, 1); continue; } const uint8_t* suffix_ptr = value.ptr + j; @@ -3207,7 +3201,7 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { const auto suffix_length = static_cast(len - j); if (suffix_length == 0) { - suffix_encoder_.Put(&kEmpty, 1); + // suffix_encoder_.Put(&kEmpty, 1); continue; } const uint8_t* suffix_ptr = src[i].ptr + j; @@ -3352,7 +3346,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode buffer[i].ptr = data_ptr; buffer[i].len += prefix_len_ptr[i]; data_ptr += buffer[i].len; - prefix = string_view{reinterpret_cast(buffer[i].ptr), buffer[i].len}; + prefix = std::string_view{buffer[i]}; } prefix_len_offset_ += max_values; this->num_values_ -= max_values; @@ -3432,9 +3426,6 @@ class DeltaByteArrayFLBADecoder : public DeltaByteArrayDecoderImpl, using Base::DeltaByteArrayDecoderImpl; using Base::pool_; - int Decode(ByteArray* buffer, int max_values) { - return GetInternal(buffer, max_values); - } int Decode(FixedLenByteArray* buffer, int max_values) override { int decoded_values_size = max_values; if (MultiplyWithOverflow(decoded_values_size, diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 903a4896345..3794a8fc6aa 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1894,7 +1894,6 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { ASSERT_EQ(values->length(), result->length()); ASSERT_OK(result->ValidateFull()); - auto upcast_result = CastBinaryTypesHelper(result, values->type()); ::arrow::AssertArraysEqual(*values, *result); }; @@ -1987,8 +1986,8 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { static constexpr int TYPE = Type::type_num; void CheckRoundtrip() override { - auto encoder = - MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, /*xxx=*/ false, descr_.get()); + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY, descr_.get()); encoder->Put(draws_, num_values_); @@ -2003,8 +2002,8 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { void CheckRoundtripSpaced(const uint8_t* valid_bits, int64_t valid_bits_offset) override { - auto encoder = - MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, false, descr_.get()); + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr_.get()); auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY, descr_.get()); int null_count = 0; for (auto i = 0; i < num_values_; i++) { @@ -2075,7 +2074,6 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { ASSERT_EQ(values->length(), result->length()); ASSERT_OK(result->ValidateFull()); - auto upcast_result = CastBinaryTypesHelper(result, values->type()); ::arrow::AssertArraysEqual(*values, *result); }; @@ -2132,33 +2130,11 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPutFixedLength) { TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { auto CheckEncode = [](std::shared_ptr<::arrow::Array> values, - std::shared_ptr<::arrow::Array> prefix_lengths, - std::shared_ptr<::arrow::Array> suffix_lengths, - std::string_view value) { + std::shared_ptr encoded) { auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); ASSERT_NO_THROW(encoder->Put(*values)); auto buf = encoder->FlushValues(); - - auto prefix_lengths_encoder = - MakeTypedEncoder(Encoding::DELTA_BINARY_PACKED); - ASSERT_NO_THROW(prefix_lengths_encoder->Put(*prefix_lengths)); - auto prefix_lengths_buf = prefix_lengths_encoder->FlushValues(); - - auto encoded_prefix_lengths_buf = SliceBuffer(buf, 0, prefix_lengths_buf->size()); - - auto suffix_lengths_encoder = - MakeTypedEncoder(Encoding::DELTA_BINARY_PACKED); - ASSERT_NO_THROW(suffix_lengths_encoder->Put(*suffix_lengths)); - auto suffix_lengths_buf = suffix_lengths_encoder->FlushValues(); - auto encoded_values_buf = - SliceBuffer(buf, prefix_lengths_buf->size() + suffix_lengths_buf->size()); - - auto encoded_prefix_length_buf = SliceBuffer(buf, 0, prefix_lengths_buf->size()); - EXPECT_TRUE(prefix_lengths_buf->Equals(*encoded_prefix_length_buf)); - auto encoded_suffix_length_buf = - SliceBuffer(buf, prefix_lengths_buf->size(), suffix_lengths_buf->size()); - EXPECT_TRUE(suffix_lengths_buf->Equals(*encoded_suffix_length_buf)); - EXPECT_EQ(value, encoded_values_buf->ToString()); + ASSERT_TRUE(encoded->Equals(*buf)); }; auto arrayToI32 = [](const std::shared_ptr<::arrow::Array>& lengths) { @@ -2201,20 +2177,16 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { std::shared_ptr<::arrow::Array> prefix_lengths, std::shared_ptr<::arrow::Array> suffix_lengths, std::string_view suffix_data) { - CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), prefix_lengths, - suffix_lengths, suffix_data); - CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), prefix_lengths, - suffix_lengths, suffix_data); - CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), prefix_lengths, - suffix_lengths, suffix_data); - CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), prefix_lengths, - suffix_lengths, suffix_data); - auto encoded = ::arrow::ConcatenateBuffers({DeltaEncode(arrayToI32(prefix_lengths)), DeltaEncode(arrayToI32(suffix_lengths)), std::make_shared(suffix_data)}) .ValueOrDie(); + CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), encoded); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), encoded); + CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), encoded); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), encoded); + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::utf8(), values)); CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_utf8(), values)); CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::binary(), values)); diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index e81e9de0a1e..0315376a883 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -577,6 +577,11 @@ struct ByteArray { ByteArray(::std::string_view view) // NOLINT implicit conversion : ByteArray(static_cast(view.size()), reinterpret_cast(view.data())) {} + + explicit operator std::string_view() const { + return std::string_view{reinterpret_cast(ptr), len}; + } + uint32_t len; const uint8_t* ptr; }; From c07c8657098795f6b7d1596af8c6166ac6886eb7 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 17 May 2023 03:06:49 +0200 Subject: [PATCH 35/78] Chunk prefix lengths --- cpp/src/parquet/encoding.cc | 112 +++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 53 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index ab277578db2..736ad2a4649 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3046,7 +3046,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(""))) {} + kEmpty(ByteArray(0, reinterpret_cast(""))), + prefix_lengths_(kBatchSize_, ::arrow::stl::allocator(pool_)) {} std::shared_ptr FlushValues() override; @@ -3123,6 +3124,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
suffix_encoder_; std::string last_value_; const ByteArray kEmpty; + ArrowPoolVector prefix_lengths_; + static constexpr int kBatchSize_ = 256; }; template <> @@ -3130,42 +3133,44 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val if (num_values == 0) { return; } - ArrowPoolVector prefix_lengths(num_values, - ::arrow::stl::allocator(pool_)); + std::string_view last_value_view = last_value_; - for (int i = 0; i < num_values; i++) { - // Convert to ByteArray, so we can pass to the suffix_encoder_. - const ByteArray value = src[i]; - if (ARROW_PREDICT_FALSE(value.len >= static_cast(kMaxByteArraySize))) { - throw ParquetException("Parquet cannot store strings with size 2GB or more"); - } - auto view = std::string_view{value}; + for (int i = 0; i < num_values; i += kBatchSize_) { + const int batch_size = std::min(kBatchSize_, num_values - i); - uint32_t j = 0; - const uint32_t common_length = - std::min(value.len, static_cast(last_value_view.length())); - while (j < common_length) { - if (last_value_view[j] != view[j]) { - break; + for (int j = 0; j < batch_size; ++j) { + // Convert to ByteArray, so we can pass to the suffix_encoder_. + const ByteArray value = src[i + j]; + if (ARROW_PREDICT_FALSE(value.len >= static_cast(kMaxByteArraySize))) { + throw ParquetException("Parquet cannot store strings with size 2GB or more"); + } + auto view = std::string_view{value}; + + uint32_t k = 0; + const uint32_t common_length = + std::min(value.len, static_cast(last_value_view.length())); + while (k < common_length) { + if (last_value_view[k] != view[k]) { + break; + } + k++; } - j++; - } - last_value_view = view; - prefix_lengths[i] = j; - const auto suffix_length = static_cast(value.len - j); + last_value_view = view; + prefix_lengths_[j] = k; + const auto suffix_length = static_cast(value.len - k); - if (suffix_length == 0) { - // suffix_encoder_.Put(&kEmpty, 1); - continue; + if (suffix_length == 0) { + continue; + } + const uint8_t* suffix_ptr = value.ptr + k; + // Convert to ByteArray, so it can be passed to the suffix_encoder_. + const ByteArray suffix(suffix_length, suffix_ptr); + suffix_encoder_.Put(&suffix, 1); } - const uint8_t* suffix_ptr = value.ptr + j; - // Convert to ByteArray, so it can be passed to the suffix_encoder_. - const ByteArray suffix(suffix_length, suffix_ptr); - suffix_encoder_.Put(&suffix, 1); + prefix_length_encoder_.Put(prefix_lengths_.data(), batch_size); } - prefix_length_encoder_.Put(prefix_lengths.data(), num_values); last_value_ = last_value_view; } @@ -3174,8 +3179,7 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { if (num_values == 0) { return; } - ArrowPoolVector prefix_lengths(num_values, - ::arrow::stl::allocator(pool_)); + std::string_view last_value_view = last_value_; const int32_t len = descr_->type_length(); @@ -3183,33 +3187,35 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { throw ParquetException("Parquet cannot store strings with size 2GB or more"); } - for (int i = 0; i < num_values; i++) { - auto view = string_view{reinterpret_cast(src[i].ptr), - static_cast(len)}; - int32_t j = 0; - const int32_t common_length = - std::min(len, static_cast(last_value_view.length())); - while (j < common_length) { - if (last_value_view[j] != view[j]) { - break; + for (int i = 0; i < num_values; i += kBatchSize_) { + const int batch_size = std::min(kBatchSize_, num_values - i); + for (int j = 0; j < batch_size; j++) { + auto view = string_view{reinterpret_cast(src[i + j].ptr), + static_cast(len)}; + int32_t k = 0; + const int32_t common_length = + std::min(len, static_cast(last_value_view.length())); + while (k < common_length) { + if (last_value_view[k] != view[k]) { + break; + } + k++; } - j++; - } - last_value_view = view; - prefix_lengths[i] = j; - const auto suffix_length = static_cast(len - j); + last_value_view = view; + prefix_lengths_[j] = k; + const auto suffix_length = static_cast(len - k); - if (suffix_length == 0) { - // suffix_encoder_.Put(&kEmpty, 1); - continue; + if (suffix_length == 0) { + continue; + } + const uint8_t* suffix_ptr = src[i + j].ptr + k; + // Convert to ByteArray, so it can be passed to the suffix_encoder_ + const ByteArray suffix(suffix_length, suffix_ptr); + suffix_encoder_.Put(&suffix, 1); } - const uint8_t* suffix_ptr = src[i].ptr + j; - // Convert to ByteArray, so it can be passed to the suffix_encoder_ - const ByteArray suffix(suffix_length, suffix_ptr); - suffix_encoder_.Put(&suffix, 1); + prefix_length_encoder_.Put(prefix_lengths_.data(), batch_size); } - prefix_length_encoder_.Put(prefix_lengths.data(), num_values); last_value_ = last_value_view; } From ca3660d4196f0828cf27fe9e28bf0ec188b81b5b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 18 May 2023 01:28:22 +0200 Subject: [PATCH 36/78] Update cpp/src/parquet/encoding.cc Co-authored-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 736ad2a4649..b5b59c33034 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3161,9 +3161,6 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val prefix_lengths_[j] = k; const auto suffix_length = static_cast(value.len - k); - if (suffix_length == 0) { - continue; - } const uint8_t* suffix_ptr = value.ptr + k; // Convert to ByteArray, so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); From 6951e037e1dbe871cbe572233adf0e85e30f6e41 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 18 May 2023 01:36:43 +0200 Subject: [PATCH 37/78] Review feedback --- cpp/src/parquet/encoding.cc | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b5b59c33034..a308ddfe01d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3046,8 +3046,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(""))), - prefix_lengths_(kBatchSize_, ::arrow::stl::allocator(pool_)) {} + kEmpty(ByteArray(0, reinterpret_cast(""))) {} std::shared_ptr FlushValues() override; @@ -3124,8 +3123,6 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
suffix_encoder_; std::string last_value_; const ByteArray kEmpty; - ArrowPoolVector prefix_lengths_; - static constexpr int kBatchSize_ = 256; }; template <> @@ -3135,9 +3132,11 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val } std::string_view last_value_view = last_value_; + constexpr int kBatchSize = 256; + std::array prefix_lengths; - for (int i = 0; i < num_values; i += kBatchSize_) { - const int batch_size = std::min(kBatchSize_, num_values - i); + for (int i = 0; i < num_values; i += kBatchSize) { + const int batch_size = std::min(kBatchSize, num_values - i); for (int j = 0; j < batch_size; ++j) { // Convert to ByteArray, so we can pass to the suffix_encoder_. @@ -3158,7 +3157,7 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val } last_value_view = view; - prefix_lengths_[j] = k; + prefix_lengths[j] = k; const auto suffix_length = static_cast(value.len - k); const uint8_t* suffix_ptr = value.ptr + k; @@ -3166,7 +3165,7 @@ void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_val const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); } - prefix_length_encoder_.Put(prefix_lengths_.data(), batch_size); + prefix_length_encoder_.Put(prefix_lengths.data(), batch_size); } last_value_ = last_value_view; } @@ -3184,8 +3183,10 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { throw ParquetException("Parquet cannot store strings with size 2GB or more"); } - for (int i = 0; i < num_values; i += kBatchSize_) { - const int batch_size = std::min(kBatchSize_, num_values - i); + constexpr int kBatchSize = 256; + std::array prefix_lengths; + for (int i = 0; i < num_values; i += kBatchSize) { + const int batch_size = std::min(kBatchSize, num_values - i); for (int j = 0; j < batch_size; j++) { auto view = string_view{reinterpret_cast(src[i + j].ptr), static_cast(len)}; @@ -3200,18 +3201,15 @@ void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { } last_value_view = view; - prefix_lengths_[j] = k; + prefix_lengths[j] = k; const auto suffix_length = static_cast(len - k); - if (suffix_length == 0) { - continue; - } const uint8_t* suffix_ptr = src[i + j].ptr + k; // Convert to ByteArray, so it can be passed to the suffix_encoder_ const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); } - prefix_length_encoder_.Put(prefix_lengths_.data(), batch_size); + prefix_length_encoder_.Put(prefix_lengths.data(), batch_size); } last_value_ = last_value_view; } From 02fe560f9bc88377e675a3ee20dc7bd41d2b6100 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 18 May 2023 01:37:28 +0200 Subject: [PATCH 38/78] Change data distribution --- cpp/src/parquet/encoding_test.cc | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 3794a8fc6aa..d346c3363cc 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1985,6 +1985,39 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; + void InitData(int nvalues) { + auto rand = ::arrow::random::RandomArrayGenerator(42); + const int min_prefix_length = 0; + const int max_prefix_length = 10; + const size_t max_element_length = 20; + + ::arrow::StringBuilder builder; + const auto prefix_array = std::static_pointer_cast<::arrow::StringArray>( + rand.String(nvalues, /* min_length */ min_prefix_length, + /* max_length */ max_prefix_length, /*null_percent*/ 0)); + + std::string previous_element; + for (int i = 0; i < nvalues; i++) { + auto element = prefix_array->GetString(i); + + if (previous_element.length() <= max_element_length) { + previous_element = previous_element.append(element); + } else { + previous_element = element; + } + ASSERT_OK(builder.Append(previous_element)); + } + + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + draws_ = reinterpret_cast(array->value_data()->mutable_data()); + } + + void Execute(int nvalues, int repeats) { + InitData(nvalues); + CheckRoundtrip(); + } + void CheckRoundtrip() override { auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, /*use_dictionary=*/false, descr_.get()); From 0f49067208d055259a785f7bbf0041d273eb55ad Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 19 May 2023 02:29:53 +0200 Subject: [PATCH 39/78] Refactor Put --- cpp/src/parquet/encoding.cc | 143 +++++++++++++++++------------------- 1 file changed, 66 insertions(+), 77 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index a308ddfe01d..780dc24f23e 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3076,6 +3076,49 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
+ void PutInternal(const T* src, int num_values) { + if (num_values == 0) { + return; + } + uint32_t len = descr_->type_length(); + + std::string_view last_value_view = last_value_; + constexpr int kBatchSize = 256; + std::array prefix_lengths; + auto visitor = VisitorType{src, len}; + + for (int i = 0; i < num_values; i += kBatchSize) { + const int batch_size = std::min(kBatchSize, num_values - i); + + for (int j = 0; j < batch_size; ++j) { + auto view = visitor[i + j]; + len = visitor.len(i + j); + + uint32_t k = 0; + const uint32_t common_length = + std::min(len, static_cast(last_value_view.length())); + while (k < common_length) { + if (last_value_view[k] != view[k]) { + break; + } + k++; + } + + last_value_view = view; + prefix_lengths[j] = k; + const auto suffix_length = len - k; + const uint8_t* suffix_ptr = src[i + j].ptr + k; + + // Convert to ByteArray, so it can be passed to the suffix_encoder_. + const ByteArray suffix(suffix_length, suffix_ptr); + suffix_encoder_.Put(&suffix, 1); + } + prefix_length_encoder_.Put(prefix_lengths.data(), batch_size); + } + last_value_ = last_value_view; + } + template void PutBinaryArray(const ArrayType& array) { auto previous_len = static_cast(last_value_.size()); @@ -3125,93 +3168,39 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
-void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_values) { - if (num_values == 0) { - return; - } - - std::string_view last_value_view = last_value_; - constexpr int kBatchSize = 256; - std::array prefix_lengths; - - for (int i = 0; i < num_values; i += kBatchSize) { - const int batch_size = std::min(kBatchSize, num_values - i); +struct ByteArrayVisitor { + const ByteArray* src; + const uint32_t length; - for (int j = 0; j < batch_size; ++j) { - // Convert to ByteArray, so we can pass to the suffix_encoder_. - const ByteArray value = src[i + j]; - if (ARROW_PREDICT_FALSE(value.len >= static_cast(kMaxByteArraySize))) { - throw ParquetException("Parquet cannot store strings with size 2GB or more"); - } - auto view = std::string_view{value}; - - uint32_t k = 0; - const uint32_t common_length = - std::min(value.len, static_cast(last_value_view.length())); - while (k < common_length) { - if (last_value_view[k] != view[k]) { - break; - } - k++; - } - - last_value_view = view; - prefix_lengths[j] = k; - const auto suffix_length = static_cast(value.len - k); - - const uint8_t* suffix_ptr = value.ptr + k; - // Convert to ByteArray, so it can be passed to the suffix_encoder_. - const ByteArray suffix(suffix_length, suffix_ptr); - suffix_encoder_.Put(&suffix, 1); + std::string_view operator[](int i) const { + if (ARROW_PREDICT_FALSE(src[i].len >= kMaxByteArraySize)) { + throw ParquetException("Parquet cannot store strings with size 2GB or more"); } - prefix_length_encoder_.Put(prefix_lengths.data(), batch_size); + return std::string_view{src[i]}; } - last_value_ = last_value_view; -} -template <> -void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { - if (num_values == 0) { - return; - } + const uint32_t len(int i) const { return src[i].len; } +}; - std::string_view last_value_view = last_value_; - const int32_t len = descr_->type_length(); +struct FLBAVisitor { + const FLBA* src; + const uint32_t length; - if (ARROW_PREDICT_FALSE(len >= static_cast(kMaxByteArraySize))) { - throw ParquetException("Parquet cannot store strings with size 2GB or more"); + std::string_view operator[](int i) const { + return std::string_view{reinterpret_cast(src[i].ptr)}; } - constexpr int kBatchSize = 256; - std::array prefix_lengths; - for (int i = 0; i < num_values; i += kBatchSize) { - const int batch_size = std::min(kBatchSize, num_values - i); - for (int j = 0; j < batch_size; j++) { - auto view = string_view{reinterpret_cast(src[i + j].ptr), - static_cast(len)}; - int32_t k = 0; - const int32_t common_length = - std::min(len, static_cast(last_value_view.length())); - while (k < common_length) { - if (last_value_view[k] != view[k]) { - break; - } - k++; - } + const uint32_t len(int i) const { return length; } +}; - last_value_view = view; - prefix_lengths[j] = k; - const auto suffix_length = static_cast(len - k); +template <> +void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_values) { + PutInternal(src, num_values); +} - const uint8_t* suffix_ptr = src[i + j].ptr + k; - // Convert to ByteArray, so it can be passed to the suffix_encoder_ - const ByteArray suffix(suffix_length, suffix_ptr); - suffix_encoder_.Put(&suffix, 1); - } - prefix_length_encoder_.Put(prefix_lengths.data(), batch_size); - } - last_value_ = last_value_view; +template <> +void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { + PutInternal(src, num_values); } template From ad90f19bb3e591d9254aaa39fac4adb90a7473b2 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 19 May 2023 03:15:31 +0200 Subject: [PATCH 40/78] Batch suffixes --- cpp/src/parquet/encoding.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 780dc24f23e..ad608e60dcb 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3086,6 +3086,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
prefix_lengths; + std::array suffixes; auto visitor = VisitorType{src, len}; for (int i = 0; i < num_values; i += kBatchSize) { @@ -3112,8 +3113,9 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
Date: Fri, 19 May 2023 10:29:23 +0200 Subject: [PATCH 41/78] Work --- cpp/src/parquet/encoding.cc | 27 ++++++-------- cpp/src/parquet/encoding_test.cc | 64 ++++++++++++++++---------------- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index ad608e60dcb..90011a1d73d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3189,7 +3189,7 @@ struct FLBAVisitor { const uint32_t length; std::string_view operator[](int i) const { - return std::string_view{reinterpret_cast(src[i].ptr)}; + return std::string_view{reinterpret_cast(src[i].ptr), length}; } const uint32_t len(int i) const { return length; } @@ -3419,21 +3419,18 @@ class DeltaByteArrayFLBADecoder : public DeltaByteArrayDecoderImpl, using Base::pool_; int Decode(FixedLenByteArray* buffer, int max_values) override { - int decoded_values_size = max_values; - if (MultiplyWithOverflow(decoded_values_size, - descr_->type_length() * sizeof(ByteArray), - &decoded_values_size)) { - throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); - } - ArrowPoolVector decode_values(decoded_values_size, - ::arrow::stl::allocator(pool_)); - auto decode_buf = reinterpret_cast(decode_values.data()); - - max_values = GetInternal(decode_buf, max_values); - for (int i = 0; i < max_values; i++) { - buffer[i].ptr = decode_buf->ptr + i * descr_->type_length(); + // GetInternal currently only support ByteArray. + std::vector decode_byte_array(max_values); + const int decoded_values_size = GetInternal(decode_byte_array.data(), max_values); + const uint32_t type_length = descr_->type_length(); + + for (int i = 0; i < decoded_values_size; i++) { + if (ARROW_PREDICT_FALSE(decode_byte_array[i].len != type_length)) { + throw ParquetException("Fixed length byte array length mismatch"); + } + buffer[i].ptr = decode_byte_array.data()->ptr + i * type_length; } - return max_values; + return decoded_values_size; } }; diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index d346c3363cc..99309957f8a 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1985,38 +1985,38 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; - void InitData(int nvalues) { - auto rand = ::arrow::random::RandomArrayGenerator(42); - const int min_prefix_length = 0; - const int max_prefix_length = 10; - const size_t max_element_length = 20; - - ::arrow::StringBuilder builder; - const auto prefix_array = std::static_pointer_cast<::arrow::StringArray>( - rand.String(nvalues, /* min_length */ min_prefix_length, - /* max_length */ max_prefix_length, /*null_percent*/ 0)); - - std::string previous_element; - for (int i = 0; i < nvalues; i++) { - auto element = prefix_array->GetString(i); - - if (previous_element.length() <= max_element_length) { - previous_element = previous_element.append(element); - } else { - previous_element = element; - } - ASSERT_OK(builder.Append(previous_element)); - } - - std::shared_ptr<::arrow::StringArray> array; - ASSERT_OK(builder.Finish(&array)); - draws_ = reinterpret_cast(array->value_data()->mutable_data()); - } - - void Execute(int nvalues, int repeats) { - InitData(nvalues); - CheckRoundtrip(); - } +// void InitData(int nvalues) { +// auto rand = ::arrow::random::RandomArrayGenerator(42); +// const int min_prefix_length = 0; +// const int max_prefix_length = 30; +// const size_t max_element_length = 10; +// +// ::arrow::StringBuilder builder; +// const auto prefix_array = std::static_pointer_cast<::arrow::StringArray>( +// rand.String(nvalues, /* min_length */ min_prefix_length, +// /* max_length */ max_prefix_length, /*null_percent*/ 0)); +// +// std::string previous_element; +// for (int i = 0; i < nvalues; i++) { +// auto element = prefix_array->GetString(i); +// +// if (previous_element.length() <= max_element_length) { +// previous_element = previous_element.append(element); +// } else { +// previous_element = element; +// } +// ASSERT_OK(builder.Append(previous_element)); +// } +// +// std::shared_ptr<::arrow::StringArray> array; +// ASSERT_OK(builder.Finish(&array)); +// draws_ = reinterpret_cast(array->value_data()->mutable_data()); +// } +// +// void Execute(int nvalues, int repeats) { +// InitData(nvalues); +// CheckRoundtrip(); +// } void CheckRoundtrip() override { auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, From d78f3a918b11dcf61b60d712c1545b914cc9ab38 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 19 May 2023 12:29:14 +0200 Subject: [PATCH 42/78] Linting --- cpp/src/parquet/encoding.cc | 4 +- cpp/src/parquet/encoding_test.cc | 64 ++++++++++++++++---------------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 90011a1d73d..8aa3df5e8ac 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3181,7 +3181,7 @@ struct ByteArrayVisitor { return std::string_view{src[i]}; } - const uint32_t len(int i) const { return src[i].len; } + uint32_t len(int i) const { return src[i].len; } }; struct FLBAVisitor { @@ -3192,7 +3192,7 @@ struct FLBAVisitor { return std::string_view{reinterpret_cast(src[i].ptr), length}; } - const uint32_t len(int i) const { return length; } + uint32_t len(int i) const { return length; } }; template <> diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 99309957f8a..ec9836113a4 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1985,38 +1985,38 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; -// void InitData(int nvalues) { -// auto rand = ::arrow::random::RandomArrayGenerator(42); -// const int min_prefix_length = 0; -// const int max_prefix_length = 30; -// const size_t max_element_length = 10; -// -// ::arrow::StringBuilder builder; -// const auto prefix_array = std::static_pointer_cast<::arrow::StringArray>( -// rand.String(nvalues, /* min_length */ min_prefix_length, -// /* max_length */ max_prefix_length, /*null_percent*/ 0)); -// -// std::string previous_element; -// for (int i = 0; i < nvalues; i++) { -// auto element = prefix_array->GetString(i); -// -// if (previous_element.length() <= max_element_length) { -// previous_element = previous_element.append(element); -// } else { -// previous_element = element; -// } -// ASSERT_OK(builder.Append(previous_element)); -// } -// -// std::shared_ptr<::arrow::StringArray> array; -// ASSERT_OK(builder.Finish(&array)); -// draws_ = reinterpret_cast(array->value_data()->mutable_data()); -// } -// -// void Execute(int nvalues, int repeats) { -// InitData(nvalues); -// CheckRoundtrip(); -// } + // void InitData(int nvalues) { + // auto rand = ::arrow::random::RandomArrayGenerator(42); + // const int min_prefix_length = 0; + // const int max_prefix_length = 30; + // const size_t max_element_length = 10; + // + // ::arrow::StringBuilder builder; + // const auto prefix_array = std::static_pointer_cast<::arrow::StringArray>( + // rand.String(nvalues, /* min_length */ min_prefix_length, + // /* max_length */ max_prefix_length, /*null_percent*/ 0)); + // + // std::string previous_element; + // for (int i = 0; i < nvalues; i++) { + // auto element = prefix_array->GetString(i); + // + // if (previous_element.length() <= max_element_length) { + // previous_element = previous_element.append(element); + // } else { + // previous_element = element; + // } + // ASSERT_OK(builder.Append(previous_element)); + // } + // + // std::shared_ptr<::arrow::StringArray> array; + // ASSERT_OK(builder.Finish(&array)); + // draws_ = reinterpret_cast(array->value_data()->mutable_data()); + // } + // + // void Execute(int nvalues, int repeats) { + // InitData(nvalues); + // CheckRoundtrip(); + // } void CheckRoundtrip() override { auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, From 7c028785ab8014dc218f2e7bbb25438429f5ce0f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 19 May 2023 13:00:55 +0200 Subject: [PATCH 43/78] Rename length --- cpp/src/parquet/encoding.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 8aa3df5e8ac..373888fc528 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3172,7 +3172,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
= kMaxByteArraySize)) { @@ -3186,13 +3186,13 @@ struct ByteArrayVisitor { struct FLBAVisitor { const FLBA* src; - const uint32_t length; + const uint32_t type_length; std::string_view operator[](int i) const { - return std::string_view{reinterpret_cast(src[i].ptr), length}; + return std::string_view{reinterpret_cast(src[i].ptr), type_length}; } - uint32_t len(int i) const { return length; } + uint32_t len(int i) const { return type_length; } }; template <> From a5be621b358f12cf35bdcc3bb68fb199eaa832be Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 26 May 2023 22:47:44 +0200 Subject: [PATCH 44/78] Update cpp/src/parquet/encoding.cc Co-authored-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 373888fc528..62d2a5aedb9 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3428,7 +3428,7 @@ class DeltaByteArrayFLBADecoder : public DeltaByteArrayDecoderImpl, if (ARROW_PREDICT_FALSE(decode_byte_array[i].len != type_length)) { throw ParquetException("Fixed length byte array length mismatch"); } - buffer[i].ptr = decode_byte_array.data()->ptr + i * type_length; + buffer[i].ptr = decode_byte_array[i].ptr; } return decoded_values_size; } From e876222df1de07a5899f51babb4bf3839afd07d1 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 4 Jun 2023 01:26:16 +0200 Subject: [PATCH 45/78] ExecuteSpaced should use alternative InitData --- cpp/src/parquet/encoding_test.cc | 83 ++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index ec9836113a4..bcd735e5e42 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1985,38 +1985,51 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; - // void InitData(int nvalues) { - // auto rand = ::arrow::random::RandomArrayGenerator(42); - // const int min_prefix_length = 0; - // const int max_prefix_length = 30; - // const size_t max_element_length = 10; - // - // ::arrow::StringBuilder builder; - // const auto prefix_array = std::static_pointer_cast<::arrow::StringArray>( - // rand.String(nvalues, /* min_length */ min_prefix_length, - // /* max_length */ max_prefix_length, /*null_percent*/ 0)); - // - // std::string previous_element; - // for (int i = 0; i < nvalues; i++) { - // auto element = prefix_array->GetString(i); - // - // if (previous_element.length() <= max_element_length) { - // previous_element = previous_element.append(element); - // } else { - // previous_element = element; - // } - // ASSERT_OK(builder.Append(previous_element)); - // } - // - // std::shared_ptr<::arrow::StringArray> array; - // ASSERT_OK(builder.Finish(&array)); - // draws_ = reinterpret_cast(array->value_data()->mutable_data()); - // } - // - // void Execute(int nvalues, int repeats) { - // InitData(nvalues); - // CheckRoundtrip(); - // } + void InitData(int nvalues, double null_probability) { + auto rand = ::arrow::random::RandomArrayGenerator(42); + const int min_prefix_length = 0; + const int max_prefix_length = 50; + const size_t max_element_length = 100; + + ::arrow::StringBuilder builder; + const auto prefix_array = std::static_pointer_cast<::arrow::StringArray>( + rand.String(/*size*/ nvalues, /*min_length*/ min_prefix_length, + /*max_length*/ max_prefix_length, /*null_percent*/ null_probability)); + + std::string previous_element; + for (int i = 0; i < nvalues; i++) { + auto element = prefix_array->GetString(i); + + if (previous_element.length() <= max_element_length) { + previous_element = previous_element.append(element); + } else { + previous_element = element; + } + ASSERT_OK(builder.Append(previous_element)); + } + + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + draws_ = reinterpret_cast(array->value_data()->mutable_data()); + } + + void Execute(int nvalues, double null_probability) { + InitData(nvalues, null_probability); + CheckRoundtrip(); + } + + void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, + double null_probability) { + InitData(nvalues, null_probability); + + int64_t size = num_values_ + valid_bits_offset; + auto rand = ::arrow::random::RandomArrayGenerator(1923); + const auto array = rand.UInt8(size, 0, 100, null_probability); + const auto valid_bits = array->null_bitmap_data(); + if (valid_bits) { + CheckRoundtripSpaced(valid_bits, valid_bits_offset); + } + } void CheckRoundtrip() override { auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, @@ -2065,11 +2078,11 @@ TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); - ASSERT_NO_FATAL_FAILURE(this->Execute(250, 2)); + ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0.1)); ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( - /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_prob*/ 0)); + /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ 0)); - ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 200)); + ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, /*null_probability*/ 0.1)); From 744520ca983df0a6949d40a4bcbb13ce10455919 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 16 Jun 2023 16:33:55 +0200 Subject: [PATCH 46/78] Random data generator --- cpp/src/parquet/encoding_test.cc | 46 +++++++++++++++++++------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index bcd735e5e42..97ca9276f0c 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1988,29 +1988,37 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { void InitData(int nvalues, double null_probability) { auto rand = ::arrow::random::RandomArrayGenerator(42); const int min_prefix_length = 0; - const int max_prefix_length = 50; const size_t max_element_length = 100; - ::arrow::StringBuilder builder; - const auto prefix_array = std::static_pointer_cast<::arrow::StringArray>( - rand.String(/*size*/ nvalues, /*min_length*/ min_prefix_length, - /*max_length*/ max_prefix_length, /*null_percent*/ null_probability)); - - std::string previous_element; - for (int i = 0; i < nvalues; i++) { - auto element = prefix_array->GetString(i); - - if (previous_element.length() <= max_element_length) { - previous_element = previous_element.append(element); - } else { - previous_element = element; - } - ASSERT_OK(builder.Append(previous_element)); + const auto suffix_array = std::static_pointer_cast<::arrow::StringArray>(rand.String( + /*size*/ nvalues, /*min_length*/ min_prefix_length, + /*max_length*/ max_element_length, /*null_probability*/ null_probability)); + + // First prefix length is always 0, so we manually prepend a 0 to the buffer + const auto prefix_lengths = + rand.UInt8(/*size*/ std::max(nvalues - 1, 0), /*min*/ min_prefix_length, + /*max*/ max_element_length, + /*null_probability*/ null_probability); + + ::arrow::BufferBuilder sink(default_memory_pool()); + if (nvalues > 0) { + ::arrow::UInt8Builder uint8_builder; + PARQUET_THROW_NOT_OK(uint8_builder.AppendValues({uint8_t{0}})); + PARQUET_ASSIGN_OR_THROW(auto uint8_array, uint8_builder.Finish()); + auto uint8_buffer = uint8_array->data()->buffers[1]; + PARQUET_THROW_NOT_OK(sink.Append(uint8_buffer->data(), uint8_buffer->size())); } - std::shared_ptr<::arrow::StringArray> array; - ASSERT_OK(builder.Finish(&array)); - draws_ = reinterpret_cast(array->value_data()->mutable_data()); + auto prefix_lengths_buffer = prefix_lengths->data()->buffers[1]; + auto suffix_array_buffer = suffix_array->data()->buffers[1]; + + PARQUET_THROW_NOT_OK( + sink.Append(prefix_lengths_buffer->data(), prefix_lengths_buffer->size())); + PARQUET_THROW_NOT_OK( + sink.Append(suffix_array_buffer->data(), suffix_array_buffer->size())); + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(sink.Finish(&buffer, true)); + draws_ = reinterpret_cast(buffer->mutable_data()); } void Execute(int nvalues, double null_probability) { From 33d5111f3897084e40e4239eb197b77b7ab85e19 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 27 Jun 2023 21:08:07 +0200 Subject: [PATCH 47/78] Concatenation probability --- cpp/src/parquet/encoding_test.cc | 67 +++++++++++++++++--------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 97ca9276f0c..d2f59417e1a 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1988,37 +1988,40 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { void InitData(int nvalues, double null_probability) { auto rand = ::arrow::random::RandomArrayGenerator(42); const int min_prefix_length = 0; - const size_t max_element_length = 100; - - const auto suffix_array = std::static_pointer_cast<::arrow::StringArray>(rand.String( - /*size*/ nvalues, /*min_length*/ min_prefix_length, - /*max_length*/ max_element_length, /*null_probability*/ null_probability)); - - // First prefix length is always 0, so we manually prepend a 0 to the buffer - const auto prefix_lengths = - rand.UInt8(/*size*/ std::max(nvalues - 1, 0), /*min*/ min_prefix_length, - /*max*/ max_element_length, - /*null_probability*/ null_probability); - - ::arrow::BufferBuilder sink(default_memory_pool()); - if (nvalues > 0) { - ::arrow::UInt8Builder uint8_builder; - PARQUET_THROW_NOT_OK(uint8_builder.AppendValues({uint8_t{0}})); - PARQUET_ASSIGN_OR_THROW(auto uint8_array, uint8_builder.Finish()); - auto uint8_buffer = uint8_array->data()->buffers[1]; - PARQUET_THROW_NOT_OK(sink.Append(uint8_buffer->data(), uint8_buffer->size())); + const int max_prefix_length = 50; + const int max_element_length = 100; + const double prefixed_probability = 0.9; + + ::arrow::StringBuilder builder; + const auto prefix_array = std::static_pointer_cast<::arrow::StringArray>( + rand.String(/*size*/ nvalues, /*min_length*/ min_prefix_length, + /*max_length*/ max_prefix_length, /*null_percent*/ + null_probability)); + + const auto is_prefixed = std::dynamic_pointer_cast<::arrow::BooleanArray>( + rand.Boolean(std::max(nvalues - 1, 0), + /*true_probability=*/prefixed_probability, + /*null_probability=*/0.0)); + + int i = 0; + std::string previous_element = ""; + + while (i < nvalues) { + const auto element = prefix_array->GetString(i); + const bool concatenate = + is_prefixed->GetView(i++) && + (element.length() + previous_element.length() <= max_element_length); + if (concatenate) { + previous_element = previous_element.append(element); + } else { + previous_element = element; + } + ASSERT_OK(builder.Append(previous_element)); } - auto prefix_lengths_buffer = prefix_lengths->data()->buffers[1]; - auto suffix_array_buffer = suffix_array->data()->buffers[1]; - - PARQUET_THROW_NOT_OK( - sink.Append(prefix_lengths_buffer->data(), prefix_lengths_buffer->size())); - PARQUET_THROW_NOT_OK( - sink.Append(suffix_array_buffer->data(), suffix_array_buffer->size())); - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(sink.Finish(&buffer, true)); - draws_ = reinterpret_cast(buffer->mutable_data()); + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + draws_ = reinterpret_cast(array->value_data()->mutable_data()); } void Execute(int nvalues, double null_probability) { @@ -2086,14 +2089,14 @@ TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); - ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0.1)); + ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ 0)); - ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); + ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0)); ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, - /*null_probability*/ 0.1)); + /*null_probability*/ 0)); } TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { From 9ba2e0ed96dfb503a04503bd6a98a647b6b7596b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 28 Jun 2023 01:51:07 +0200 Subject: [PATCH 48/78] Update encoding_test.cc --- cpp/src/parquet/encoding_test.cc | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index d2f59417e1a..3bb4b30a159 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1987,9 +1987,9 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { void InitData(int nvalues, double null_probability) { auto rand = ::arrow::random::RandomArrayGenerator(42); - const int min_prefix_length = 0; + const int min_prefix_length = 10; const int max_prefix_length = 50; - const int max_element_length = 100; + const int max_element_length = 1000; const double prefixed_probability = 0.9; ::arrow::StringBuilder builder; @@ -1999,17 +1999,16 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { null_probability)); const auto is_prefixed = std::dynamic_pointer_cast<::arrow::BooleanArray>( - rand.Boolean(std::max(nvalues - 1, 0), + rand.Boolean(nvalues, /*true_probability=*/prefixed_probability, /*null_probability=*/0.0)); - int i = 0; std::string previous_element = ""; + for (int i = 0; i < nvalues; i++) { + const std::string element = prefix_array->GetString(i); - while (i < nvalues) { - const auto element = prefix_array->GetString(i); const bool concatenate = - is_prefixed->GetView(i++) && + is_prefixed->GetView(i) && (element.length() + previous_element.length() <= max_element_length); if (concatenate) { previous_element = previous_element.append(element); @@ -2021,6 +2020,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { std::shared_ptr<::arrow::StringArray> array; ASSERT_OK(builder.Finish(&array)); + ASSERT_EQ(nvalues, array->length()); draws_ = reinterpret_cast(array->value_data()->mutable_data()); } @@ -2084,19 +2084,20 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { USING_BASE_MEMBERS(); }; -using TestDeltaByteArrayEncodingTypes = ::testing::Types; +using TestDeltaByteArrayEncodingTypes = ::testing::Types; TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { - ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); + // ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); - ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( - /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ 0)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ + // 0)); ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0)); - ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( - /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, - /*null_probability*/ 0)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, + // /*null_probability*/ 0)); } TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { From bab955bedb219a96896b35d9fcc0e424335f5046 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 29 Jun 2023 23:43:01 +0200 Subject: [PATCH 49/78] Change random strning generation --- cpp/src/parquet/encoding_test.cc | 60 +++++++++++++++++--------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 3bb4b30a159..5e1b33dadff 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1986,36 +1986,40 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { static constexpr int TYPE = Type::type_num; void InitData(int nvalues, double null_probability) { - auto rand = ::arrow::random::RandomArrayGenerator(42); - const int min_prefix_length = 10; - const int max_prefix_length = 50; + const int seed = 42; + auto rand = ::arrow::random::RandomArrayGenerator(seed); + const int min_prefix_length = 0; + const int max_prefix_length = 2; const int max_element_length = 1000; const double prefixed_probability = 0.9; - ::arrow::StringBuilder builder; - const auto prefix_array = std::static_pointer_cast<::arrow::StringArray>( - rand.String(/*size*/ nvalues, /*min_length*/ min_prefix_length, - /*max_length*/ max_prefix_length, /*null_percent*/ - null_probability)); + const auto prefix_length_array = std::dynamic_pointer_cast<::arrow::UInt8Array>( + rand.UInt8(nvalues, min_prefix_length, max_prefix_length, null_probability)); + const auto null_bitmap = prefix_length_array->null_bitmap_data(); - const auto is_prefixed = std::dynamic_pointer_cast<::arrow::BooleanArray>( + const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( rand.Boolean(nvalues, /*true_probability=*/prefixed_probability, /*null_probability=*/0.0)); - std::string previous_element = ""; - for (int i = 0; i < nvalues; i++) { - const std::string element = prefix_array->GetString(i); + ::arrow::StringBuilder builder; + std::string prefix; - const bool concatenate = - is_prefixed->GetView(i) && - (element.length() + previous_element.length() <= max_element_length); - if (concatenate) { - previous_element = previous_element.append(element); + for (int i = 0; i < nvalues; i++) { + if (null_bitmap && !::arrow::bit_util::GetBit(null_bitmap, i)) { + PARQUET_THROW_NOT_OK(builder.AppendNull()); } else { - previous_element = element; + std::string element = ::arrow::random_string(prefix_length_array->Value(i), seed); + const bool concatenate = + do_prefix->GetView(i) && + (prefix.length() + element.length() <= max_element_length); + if (concatenate) { + prefix = prefix + element; + } else { + prefix = element; + } + PARQUET_THROW_NOT_OK(builder.Append(prefix)); } - ASSERT_OK(builder.Append(previous_element)); } std::shared_ptr<::arrow::StringArray> array; @@ -2088,16 +2092,16 @@ using TestDeltaByteArrayEncodingTypes = ::testing::Types; TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { - // ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); + ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); - // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( - // /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ - // 0)); - - ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0)); - // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( - // /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, - // /*null_probability*/ 0)); + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ + 0)); + + ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, + /*null_probability*/ 0.1)); } TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { From a3b3d0ce48af82f294d2a31983f31e4bcd73e7e3 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 30 Jun 2023 15:47:20 +0200 Subject: [PATCH 50/78] test --- cpp/src/parquet/encoding.cc | 9 ++-- cpp/src/parquet/encoding_test.cc | 78 +++++++++++++++++--------------- 2 files changed, 46 insertions(+), 41 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 62d2a5aedb9..16007a250f6 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3082,6 +3082,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
type_length(); + uint32_t len2 = len; std::string_view last_value_view = last_value_; constexpr int kBatchSize = 256; @@ -3094,11 +3095,11 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(last_value_view.length())); + std::min(len2, static_cast(last_value_view.length())); while (k < common_length) { if (last_value_view[k] != view[k]) { break; @@ -3108,7 +3109,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
= kMaxByteArraySize)) { throw ParquetException("Parquet cannot store strings with size 2GB or more"); } - return std::string_view{src[i]}; + return std::string_view{reinterpret_cast(src[i].ptr), src[i].len}; } uint32_t len(int i) const { return src[i].len; } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 5e1b33dadff..b3d69959911 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1989,42 +1989,46 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { const int seed = 42; auto rand = ::arrow::random::RandomArrayGenerator(seed); const int min_prefix_length = 0; - const int max_prefix_length = 2; - const int max_element_length = 1000; - const double prefixed_probability = 0.9; - - const auto prefix_length_array = std::dynamic_pointer_cast<::arrow::UInt8Array>( - rand.UInt8(nvalues, min_prefix_length, max_prefix_length, null_probability)); - const auto null_bitmap = prefix_length_array->null_bitmap_data(); - - const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( - rand.Boolean(nvalues, - /*true_probability=*/prefixed_probability, - /*null_probability=*/0.0)); - - ::arrow::StringBuilder builder; - std::string prefix; - - for (int i = 0; i < nvalues; i++) { - if (null_bitmap && !::arrow::bit_util::GetBit(null_bitmap, i)) { - PARQUET_THROW_NOT_OK(builder.AppendNull()); - } else { - std::string element = ::arrow::random_string(prefix_length_array->Value(i), seed); - const bool concatenate = - do_prefix->GetView(i) && - (prefix.length() + element.length() <= max_element_length); - if (concatenate) { - prefix = prefix + element; - } else { - prefix = element; - } - PARQUET_THROW_NOT_OK(builder.Append(prefix)); - } - } - - std::shared_ptr<::arrow::StringArray> array; - ASSERT_OK(builder.Finish(&array)); - ASSERT_EQ(nvalues, array->length()); + const int max_prefix_length = 20; + // const int max_element_length = 1000; + // const double prefixed_probability = 0.9; + // + // const auto prefix_length_array = std::dynamic_pointer_cast<::arrow::UInt8Array>( + // rand.UInt8(nvalues, min_prefix_length, max_prefix_length, + // null_probability)); + // const auto null_bitmap = prefix_length_array->null_bitmap_data(); + // + // const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( + // rand.Boolean(nvalues, + // /*true_probability=*/prefixed_probability, + // /*null_probability=*/0.0)); + // + // ::arrow::StringBuilder builder; + // std::string prefix; + // + // for (int i = 0; i < nvalues; i++) { + // if (null_bitmap && !::arrow::bit_util::GetBit(null_bitmap, i)) { + // PARQUET_THROW_NOT_OK(builder.AppendNull()); + // } else { + // std::string element = ::arrow::random_string(prefix_length_array->Value(i), + // seed); const bool concatenate = + // do_prefix->GetView(i) && + // (prefix.length() + element.length() <= max_element_length); + // if (concatenate) { + // prefix = prefix + element; + // } else { + // prefix = element; + // } + // PARQUET_THROW_NOT_OK(builder.Append(prefix)); + // } + // } + // + // std::shared_ptr<::arrow::StringArray> array; + // ASSERT_OK(builder.Finish(&array)); + // ASSERT_EQ(nvalues, array->length()); + + auto array = std::dynamic_pointer_cast<::arrow::StringArray>( + rand.String(0, min_prefix_length, max_prefix_length, null_probability)); draws_ = reinterpret_cast(array->value_data()->mutable_data()); } @@ -2088,7 +2092,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { USING_BASE_MEMBERS(); }; -using TestDeltaByteArrayEncodingTypes = ::testing::Types; +using TestDeltaByteArrayEncodingTypes = ::testing::Types; TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { From 18673479527eacf36871a5f9429f2a71b443fee0 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 3 Jul 2023 13:11:02 +0200 Subject: [PATCH 51/78] Minor change --- cpp/src/parquet/encoding.cc | 22 +++++----- cpp/src/parquet/encoding_test.cc | 72 ++++++++++++++------------------ 2 files changed, 42 insertions(+), 52 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 16007a250f6..0b278e57f40 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2828,7 +2828,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<::arrow::bit_util::BitReader> decoder_; DeltaBitPackDecoder len_decoder_; - int num_valid_values_; + int num_valid_values_{0}; uint32_t length_idx_; std::shared_ptr buffered_length_; }; @@ -3082,7 +3082,6 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
type_length(); - uint32_t len2 = len; std::string_view last_value_view = last_value_; constexpr int kBatchSize = 256; @@ -3095,11 +3094,11 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(last_value_view.length())); + std::min(len, static_cast(last_value_view.length())); while (k < common_length) { if (last_value_view[k] != view[k]) { break; @@ -3109,7 +3108,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
void PutBinaryArray(const ArrayType& array) { - auto previous_len = static_cast(last_value_.size()); + auto previous_len = static_cast(last_value_.length()); std::string_view last_value_view = last_value_; PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( @@ -3137,18 +3136,19 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(j)}, 1); last_value_view = view; - const auto suffix_length = static_cast(src.len - j); + const auto suffix_length = static_cast(len - j); if (suffix_length == 0) { suffix_encoder_.Put(&kEmpty, 1); return Status::OK(); @@ -3179,7 +3179,7 @@ struct ByteArrayVisitor { if (ARROW_PREDICT_FALSE(src[i].len >= kMaxByteArraySize)) { throw ParquetException("Parquet cannot store strings with size 2GB or more"); } - return std::string_view{reinterpret_cast(src[i].ptr), src[i].len}; + return std::string_view{src[i]}; } uint32_t len(int i) const { return src[i].len; } @@ -3396,7 +3396,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode std::string last_value_; // string buffer for last value in previous page std::string last_value_in_previous_page_; - int num_valid_values_; + int num_valid_values_{0}; uint32_t prefix_len_offset_; std::shared_ptr buffered_prefix_length_; std::shared_ptr buffered_data_; diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index b3d69959911..41f9d82455d 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1989,46 +1989,36 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { const int seed = 42; auto rand = ::arrow::random::RandomArrayGenerator(seed); const int min_prefix_length = 0; - const int max_prefix_length = 20; - // const int max_element_length = 1000; - // const double prefixed_probability = 0.9; - // - // const auto prefix_length_array = std::dynamic_pointer_cast<::arrow::UInt8Array>( - // rand.UInt8(nvalues, min_prefix_length, max_prefix_length, - // null_probability)); - // const auto null_bitmap = prefix_length_array->null_bitmap_data(); - // - // const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( - // rand.Boolean(nvalues, - // /*true_probability=*/prefixed_probability, - // /*null_probability=*/0.0)); - // - // ::arrow::StringBuilder builder; - // std::string prefix; - // - // for (int i = 0; i < nvalues; i++) { - // if (null_bitmap && !::arrow::bit_util::GetBit(null_bitmap, i)) { - // PARQUET_THROW_NOT_OK(builder.AppendNull()); - // } else { - // std::string element = ::arrow::random_string(prefix_length_array->Value(i), - // seed); const bool concatenate = - // do_prefix->GetView(i) && - // (prefix.length() + element.length() <= max_element_length); - // if (concatenate) { - // prefix = prefix + element; - // } else { - // prefix = element; - // } - // PARQUET_THROW_NOT_OK(builder.Append(prefix)); - // } - // } - // - // std::shared_ptr<::arrow::StringArray> array; - // ASSERT_OK(builder.Finish(&array)); - // ASSERT_EQ(nvalues, array->length()); - - auto array = std::dynamic_pointer_cast<::arrow::StringArray>( - rand.String(0, min_prefix_length, max_prefix_length, null_probability)); + const int max_prefix_length = 100; + const int max_element_length = 1000; + const double prefixed_probability = 0.5; + + const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>( + rand.String(nvalues, min_prefix_length, max_prefix_length, null_probability)); + const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( + rand.Boolean(nvalues, + /*true_probability=*/prefixed_probability, + /*null_probability=*/0.0)); + ::arrow::StringBuilder builder(::arrow::default_memory_pool()); + + std::string prefix = ""; + for (int i = 0; i < nvalues; i++) { + if (prefix_array->IsNull(i)) { + PARQUET_THROW_NOT_OK(builder.AppendNull()); + } else { + const std::string element = prefix_array->GetString(i); + if (do_prefix->Value(i) && prefix.length() < max_element_length) { + prefix = prefix.append(element); + } else { + prefix = element; + } + PARQUET_THROW_NOT_OK(builder.Append(prefix)); + } + } + + std::shared_ptr<::arrow::StringArray> array; + ASSERT_OK(builder.Finish(&array)); + num_values_ = array->length() - array->null_count(); draws_ = reinterpret_cast(array->value_data()->mutable_data()); } @@ -2105,7 +2095,7 @@ TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, - /*null_probability*/ 0.1)); + /*null_probability*/ 0.5)); } TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { From bb54a8724c2803ce9780021bd4fdaa9d00575f1b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 8 Jul 2023 02:53:06 +0200 Subject: [PATCH 52/78] Refactor DeltaByteArrayEncodingDirectPut --- cpp/src/parquet/encoding.cc | 103 ++++++++++--------------- cpp/src/parquet/encoding.h | 7 +- cpp/src/parquet/encoding_test.cc | 128 ++++++++++++++----------------- 3 files changed, 103 insertions(+), 135 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 0b278e57f40..de0db083a54 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1180,27 +1180,27 @@ int PlainBooleanDecoder::Decode(bool* buffer, int max_values) { return max_values; } -template +template struct ArrowBinaryHelper; -template <> -struct ArrowBinaryHelper { - explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* out) { - this->out = out; - this->builder = out->builder.get(); +template +struct ArrowBinaryHelper || + std::is_same_v, + void>> { + explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* acc) { + builder = acc->builder.get(); + chunks = acc->chunks; if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, - this->builder->value_data_length(), - &this->chunk_space_remaining))) { - throw ParquetException("excess expansion in ArrowBinaryHelper"); + builder->value_data_length(), + &chunk_space_remaining))) { + throw ParquetException("excess expansion in ArrowBinaryHelper"); } - this->chunk_space_remaining = - ::arrow::kBinaryMemoryLimit - this->builder->value_data_length(); } Status PushChunk() { std::shared_ptr<::arrow::Array> result; RETURN_NOT_OK(builder->Finish(&result)); - out->chunks.push_back(std::move(result)); + chunks.push_back(std::move(result)); chunk_space_remaining = ::arrow::kBinaryMemoryLimit; return Status::OK(); } @@ -1215,51 +1215,26 @@ struct ArrowBinaryHelper { void UnsafeAppendNull() { builder->UnsafeAppendNull(); } - Status Append(const uint8_t* data, int32_t length) { - chunk_space_remaining -= length; - return builder->Append(data, length); - } + virtual Status Append(const uint8_t* data, int32_t length); Status AppendNull() { return builder->AppendNull(); } - typename EncodingTraits::Accumulator* out; - ::arrow::BinaryBuilder* builder; + typename EncodingTraits::BuilderType* builder; + std::vector> chunks; int64_t chunk_space_remaining; }; template <> -struct ArrowBinaryHelper { - explicit ArrowBinaryHelper(EncodingTraits::Accumulator* builder) { - this->builder = builder; - if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, - this->builder->value_data_length(), - &this->chunk_space_remaining))) { - throw ParquetException("excess expansion in ArrowBinaryHelper"); - } - } - - Status PushChunk() { - std::shared_ptr<::arrow::Array> result; - RETURN_NOT_OK(builder->Finish(&result)); - chunks.push_back(std::move(result)); - chunk_space_remaining = ::arrow::kBinaryMemoryLimit; - return Status::OK(); - } - - bool CanFit(int64_t length) const { return length <= chunk_space_remaining; } - - Status Append(const uint8_t* data, int32_t length) { - DCHECK(CanFit(length)); - chunk_space_remaining -= length; - return builder->Append(data); - } - - Status AppendNull() { return builder->AppendNull(); } +Status ArrowBinaryHelper::Append(const uint8_t* data, int32_t length) { + chunk_space_remaining -= length; + return builder->Append(data, length); +} - ::arrow::FixedSizeBinaryBuilder* builder; - std::vector> chunks; - int64_t chunk_space_remaining; -}; +template <> +Status ArrowBinaryHelper::Append(const uint8_t* data, int32_t length) { + chunk_space_remaining -= length; + return builder->Append(data); +} template <> inline int PlainDecoder::DecodeArrow( @@ -1278,21 +1253,21 @@ inline int PlainDecoder::DecodeArrow( template <> inline int PlainDecoder::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* builder) { + typename EncodingTraits::Accumulator* acc) { int values_decoded = num_values - null_count; if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) { ParquetException::EofException(); } - PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); + PARQUET_THROW_NOT_OK(acc->builder->Reserve(num_values)); VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - builder->UnsafeAppend(data_); + acc->builder->UnsafeAppend(data_); data_ += descr_->type_length(); }, - [&]() { builder->UnsafeAppendNull(); }); + [&]() { acc->builder->UnsafeAppendNull(); }); num_values_ -= values_decoded; len_ -= descr_->type_length() * values_decoded; @@ -1347,19 +1322,19 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out) override { + typename EncodingTraits::Accumulator* acc) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, - valid_bits_offset, out, &result)); + valid_bits_offset, acc, &result)); return result; } private: Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* out, + typename EncodingTraits::Accumulator* acc, int* out_values_decoded) { - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(acc); int values_decoded = 0; RETURN_NOT_OK(helper.builder->Reserve(num_values)); @@ -1742,14 +1717,14 @@ int DictDecoderImpl::DecodeArrow( template <> inline int DictDecoderImpl::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* builder) { - if (builder->byte_width() != descr_->type_length()) { + typename EncodingTraits::Accumulator* acc) { + if (acc->builder->byte_width() != descr_->type_length()) { throw ParquetException("Byte width mismatch: builder was " + - std::to_string(builder->byte_width()) + " but decoder was " + - std::to_string(descr_->type_length())); + std::to_string(acc->builder->byte_width()) + + " but decoder was " + std::to_string(descr_->type_length())); } - PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); + PARQUET_THROW_NOT_OK(acc->builder->Reserve(num_values)); auto dict_values = reinterpret_cast(dictionary_->data()); @@ -1761,9 +1736,9 @@ inline int DictDecoderImpl::DecodeArrow( throw ParquetException(""); } PARQUET_THROW_NOT_OK(IndexInBounds(index)); - builder->UnsafeAppend(dict_values[index].ptr); + acc->builder->UnsafeAppend(dict_values[index].ptr); }, - [&]() { builder->UnsafeAppendNull(); }); + [&]() { acc->builder->UnsafeAppendNull(); }); return num_values - null_count; } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 9f9b740ff34..ef06844cc8d 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -140,6 +140,7 @@ template <> struct EncodingTraits { using Encoder = ByteArrayEncoder; using Decoder = ByteArrayDecoder; + using BuilderType = ::arrow::BinaryBuilder; /// \brief Internal helper class for decoding BYTE_ARRAY data where we can /// overflow the capacity of a single arrow::BinaryArray @@ -155,9 +156,13 @@ template <> struct EncodingTraits { using Encoder = FLBAEncoder; using Decoder = FLBADecoder; + using BuilderType = ::arrow::FixedSizeBinaryBuilder; + struct Accumulator { + std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder; + std::vector> chunks; + }; using ArrowType = ::arrow::FixedSizeBinaryType; - using Accumulator = ::arrow::FixedSizeBinaryBuilder; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>; }; diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 41f9d82455d..cab6552f536 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -916,7 +916,7 @@ std::shared_ptr<::arrow::Array> EncodingAdHocTyped::GetValues(int seed } using EncodingAdHocTypedCases = - ::testing::Types; + ::testing::Types; TYPED_TEST_SUITE(EncodingAdHocTyped, EncodingAdHocTypedCases); @@ -2087,101 +2087,89 @@ TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0)); - ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); - ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( - /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ - 0)); + // TODO - ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); - ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( - /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, - /*null_probability*/ 0.5)); + // ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ + // 0)); + // + // ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); + // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + // /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, + // /*null_probability*/ 0.5)); } -TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { - const int64_t size = 50; - const int32_t min_length = 0; - const int32_t max_length = 10; - const int32_t num_unique = 10; - const double null_probability = 0.25; - auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); - auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); +template +class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { + public: + std::unique_ptr> encoder = + MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); + std::unique_ptr> decoder = + MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); - auto CheckSeed = [&](std::shared_ptr<::arrow::Array> values) { - ASSERT_NO_THROW(encoder->Put(*values)); + void CheckDirectPut(std::shared_ptr<::arrow::Array> array) { + ASSERT_NO_THROW(encoder->Put(*array)); auto buf = encoder->FlushValues(); - int num_values = static_cast(values->length() - values->null_count()); + int num_values = static_cast(array->length() - array->null_count()); decoder->SetData(num_values, buf->data(), static_cast(buf->size())); - typename EncodingTraits::Accumulator acc; - if (::arrow::is_string(values->type()->id())) { + typename EncodingTraits::Accumulator acc; + if (::arrow::is_string(array->type()->id())) { acc.builder = std::make_unique<::arrow::StringBuilder>(); } else { acc.builder = std::make_unique<::arrow::BinaryBuilder>(); } + ASSERT_EQ(num_values, - decoder->DecodeArrow(static_cast(values->length()), - static_cast(values->null_count()), - values->null_bitmap_data(), values->offset(), &acc)); + decoder->DecodeArrow(static_cast(array->length()), + static_cast(array->null_count()), + array->null_bitmap_data(), array->offset(), &acc)); std::shared_ptr<::arrow::Array> result; ASSERT_OK(acc.builder->Finish(&result)); - ASSERT_EQ(values->length(), result->length()); + ASSERT_EQ(array->length(), result->length()); ASSERT_OK(result->ValidateFull()); - ::arrow::AssertArraysEqual(*values, *result); + ::arrow::AssertArraysEqual(*array, *result); }; - ::arrow::random::RandomArrayGenerator rag(42); - auto values = rag.String(0, min_length, max_length, null_probability); - CheckSeed(values); - for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { - rag = ::arrow::random::RandomArrayGenerator(seed); - - values = rag.String(size, min_length, max_length, null_probability); - CheckSeed(values); + void CheckRoundtrip() override { + const int64_t size = 50; + const int32_t min_length = 0; + const int32_t max_length = 10; + const int32_t num_unique = 10; + const double null_probability = 0.25; + + ::arrow::random::RandomArrayGenerator rag{42}; + std::shared_ptr<::arrow::Array> values = + rag.String(0, min_length, max_length, null_probability); + CheckDirectPut(values); + + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + rag = ::arrow::random::RandomArrayGenerator(seed); + values = rag.String(size, min_length, max_length, null_probability); + CheckDirectPut(values); - values = - rag.BinaryWithRepeats(size, num_unique, min_length, max_length, null_probability); - CheckSeed(values); + values = rag.BinaryWithRepeats(size, num_unique, min_length, max_length, + null_probability); + CheckDirectPut(values); + } } -} - -TEST(DeltaByteArrayEncodingAdHoc, ArrowBinaryDirectPutFixedLength) { - const int64_t size = 50; - const double null_probability = 0.25; - ::arrow::random::RandomArrayGenerator rag(0); - auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); - auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); - auto CheckSeed = [&](std::shared_ptr<::arrow::Array> values) { - ASSERT_NO_THROW(encoder->Put(*values)); - auto buf = encoder->FlushValues(); - - int num_values = static_cast(values->length() - values->null_count()); - decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + void Execute() { CheckRoundtrip(); } - typename EncodingTraits::Accumulator acc(values->type()); - ASSERT_EQ(num_values, - decoder->DecodeArrow(static_cast(values->length()), - static_cast(values->null_count()), - values->null_bitmap_data(), values->offset(), &acc)); + protected: + USING_BASE_MEMBERS(); +}; - std::shared_ptr<::arrow::Array> result; - ASSERT_OK(acc.Finish(&result)); - ASSERT_EQ(values->length(), result->length()); - ASSERT_OK(result->ValidateFull()); - ::arrow::AssertArraysEqual(*values, *result); - }; +using DeltaByteArrayEncodingDirectPutTypes = + ::testing::Types; // TODO: FLBAType +TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, DeltaByteArrayEncodingDirectPutTypes); - for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { - for (auto length : {0, 10, 100, 1000}) { - rag = ::arrow::random::RandomArrayGenerator(seed); - auto values = rag.FixedSizeBinary(size, length, null_probability); - CheckSeed(values); - } - } +TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { + ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); } TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { From 73316cf5af4821df1b410f1c258f3898c379fe81 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 8 Jul 2023 03:12:14 +0200 Subject: [PATCH 53/78] Update cpp/src/parquet/encoding.cc Co-authored-by: Gang Wu --- cpp/src/parquet/encoding.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index de0db083a54..758f65a4b44 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3009,6 +3009,8 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { // ---------------------------------------------------------------------- // DeltaByteArrayEncoder +constexpr std::string_view kEmpty = ""; + template class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder { public: From ba4538bd38e80dccd437835cf4691c28a9ed33f8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 8 Jul 2023 03:37:38 +0200 Subject: [PATCH 54/78] Review feedback --- cpp/src/parquet/encoding.cc | 35 +++++++++++++++++--------------- cpp/src/parquet/encoding_test.cc | 4 ++-- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 758f65a4b44..6738aeb6fea 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2560,7 +2560,7 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, : EncoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY, pool = ::arrow::default_memory_pool()), sink_(pool), - length_encoder_(nullptr, pool), + length_encoder_(descr, pool), encoded_size_{0} {} std::shared_ptr FlushValues() override; @@ -2682,7 +2682,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr, MemoryPool* pool = ::arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY), - len_decoder_(nullptr, pool), + len_decoder_(descr, pool), buffered_length_(AllocateBuffer(pool, 0)) {} void SetData(int num_values, const uint8_t* data, int len) override { @@ -2804,7 +2804,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, std::shared_ptr<::arrow::bit_util::BitReader> decoder_; DeltaBitPackDecoder len_decoder_; int num_valid_values_{0}; - uint32_t length_idx_; + uint32_t length_idx_{0}; std::shared_ptr buffered_length_; }; @@ -3009,10 +3009,10 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { // ---------------------------------------------------------------------- // DeltaByteArrayEncoder -constexpr std::string_view kEmpty = ""; - template class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder { + static constexpr std::string_view kEmpty = ""; + public: using T = typename DType::c_type; @@ -3020,10 +3020,11 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(""))) {} + empty_(static_cast(kEmpty.size()), + reinterpret_cast(kEmpty.data())) {} std::shared_ptr FlushValues() override; @@ -3070,8 +3071,9 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(len - j); if (suffix_length == 0) { - suffix_encoder_.Put(&kEmpty, 1); + suffix_encoder_.Put(&empty_, 1); return Status::OK(); } const uint8_t* suffix_ptr = src.ptr + j; @@ -3145,11 +3147,12 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
prefix_length_encoder_; DeltaLengthByteArrayEncoder suffix_encoder_; std::string last_value_; - const ByteArray kEmpty; + const ByteArray empty_; }; struct ByteArrayVisitor { const ByteArray* src; + // type_length is not used and only here to match the FLBAVisitor const uint32_t type_length; std::string_view operator[](int i) const { @@ -3224,8 +3227,8 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode MemoryPool* pool = ::arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY), pool_(pool), - prefix_len_decoder_(nullptr, pool), - suffix_decoder_(nullptr, pool), + prefix_len_decoder_(descr, pool), + suffix_decoder_(descr, pool), last_value_in_previous_page_(""), buffered_prefix_length_(AllocateBuffer(pool, 0)), buffered_data_(AllocateBuffer(pool, 0)) {} @@ -3374,7 +3377,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode // string buffer for last value in previous page std::string last_value_in_previous_page_; int num_valid_values_{0}; - uint32_t prefix_len_offset_; + uint32_t prefix_len_offset_{0}; std::shared_ptr buffered_prefix_length_; std::shared_ptr buffered_data_; }; diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index cab6552f536..254f1b4127e 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2018,7 +2018,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { std::shared_ptr<::arrow::StringArray> array; ASSERT_OK(builder.Finish(&array)); - num_values_ = array->length() - array->null_count(); + num_values_ = static_cast(array->length() - array->null_count()); draws_ = reinterpret_cast(array->value_data()->mutable_data()); } @@ -2133,7 +2133,7 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { ASSERT_OK(result->ValidateFull()); ::arrow::AssertArraysEqual(*array, *result); - }; + } void CheckRoundtrip() override { const int64_t size = 50; From bc7fcde17d66e231047b2820ae64c127b72e4901 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 8 Jul 2023 14:07:45 +0200 Subject: [PATCH 55/78] Review feedback --- cpp/src/parquet/encoding.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 6738aeb6fea..ed4afcd856e 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2560,7 +2560,7 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, : EncoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY, pool = ::arrow::default_memory_pool()), sink_(pool), - length_encoder_(descr, pool), + length_encoder_(nullptr, pool), encoded_size_{0} {} std::shared_ptr FlushValues() override; @@ -2682,7 +2682,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr, MemoryPool* pool = ::arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY), - len_decoder_(descr, pool), + len_decoder_(nullptr, pool), buffered_length_(AllocateBuffer(pool, 0)) {} void SetData(int num_values, const uint8_t* data, int len) override { @@ -3020,7 +3020,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(kEmpty.size()), @@ -3227,8 +3227,8 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode MemoryPool* pool = ::arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY), pool_(pool), - prefix_len_decoder_(descr, pool), - suffix_decoder_(descr, pool), + prefix_len_decoder_(nullptr, pool), + suffix_decoder_(nullptr, pool), last_value_in_previous_page_(""), buffered_prefix_length_(AllocateBuffer(pool, 0)), buffered_data_(AllocateBuffer(pool, 0)) {} From 3dc32b3d4fda68aaca86a1d21a1086034ad3bf0b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 8 Jul 2023 17:49:56 +0200 Subject: [PATCH 56/78] Apply suggestions from code review Co-authored-by: Gang Wu --- cpp/src/parquet/encoding_test.cc | 86 ++++++++++++++++---------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 254f1b4127e..46d1759d585 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1986,18 +1986,18 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { static constexpr int TYPE = Type::type_num; void InitData(int nvalues, double null_probability) { - const int seed = 42; - auto rand = ::arrow::random::RandomArrayGenerator(seed); - const int min_prefix_length = 0; - const int max_prefix_length = 100; - const int max_element_length = 1000; - const double prefixed_probability = 0.5; + constexpr int kMinPrefixLength = 0; + constexpr int kMaxPrefixLength = 100; + constexpr int kMaxElementLength = 1000; + constexpr double kPrefixedProbability = 0.5; + constexpr int kSeed = 42; + auto rand = ::arrow::random::RandomArrayGenerator(kSeed); const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>( - rand.String(nvalues, min_prefix_length, max_prefix_length, null_probability)); + rand.String(nvalues, kMinPrefixLength, kMaxPrefixLength, null_probability)); const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( rand.Boolean(nvalues, - /*true_probability=*/prefixed_probability, + /*true_probability=*/kPrefixedProbability, /*null_probability=*/0.0)); ::arrow::StringBuilder builder(::arrow::default_memory_pool()); @@ -2007,7 +2007,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { PARQUET_THROW_NOT_OK(builder.AppendNull()); } else { const std::string element = prefix_array->GetString(i); - if (do_prefix->Value(i) && prefix.length() < max_element_length) { + if (do_prefix->Value(i) && prefix.length() < kMaxElementLength) { prefix = prefix.append(element); } else { prefix = element; @@ -2033,7 +2033,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { int64_t size = num_values_ + valid_bits_offset; auto rand = ::arrow::random::RandomArrayGenerator(1923); - const auto array = rand.UInt8(size, 0, 100, null_probability); + const auto array = rand.UInt8(size, /*min=*/0, /*max=*/100, null_probability); const auto valid_bits = array->null_bitmap_data(); if (valid_bits) { CheckRoundtripSpaced(valid_bits, valid_bits_offset); @@ -2136,24 +2136,24 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { } void CheckRoundtrip() override { - const int64_t size = 50; - const int32_t min_length = 0; - const int32_t max_length = 10; - const int32_t num_unique = 10; - const double null_probability = 0.25; - - ::arrow::random::RandomArrayGenerator rag{42}; + constexpr int64_t kSize = 50; + constexpr int32_t kMinLength = 0; + constexpr int32_t kMaxLength = 10; + constexpr int32_t kNumUnique = 10; + constexpr double kNullProbability = 0.25; + constexpr int kSeed = 42; + ::arrow::random::RandomArrayGenerator rag{kSeed}; std::shared_ptr<::arrow::Array> values = - rag.String(0, min_length, max_length, null_probability); + rag.String(0, kMinLength, kMaxLength, kNullProbability); CheckDirectPut(values); for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { rag = ::arrow::random::RandomArrayGenerator(seed); - values = rag.String(size, min_length, max_length, null_probability); + values = rag.String(kSize, kMinLength, kMaxLength, kNullProbability); CheckDirectPut(values); - values = rag.BinaryWithRepeats(size, num_unique, min_length, max_length, - null_probability); + values = rag.BinaryWithRepeats(kSize, kNumUnique, kMinLength, kMaxLength, + kNullProbability); CheckDirectPut(values); } } @@ -2173,15 +2173,15 @@ TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { } TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { - auto CheckEncode = [](std::shared_ptr<::arrow::Array> values, - std::shared_ptr encoded) { + auto CheckEncode = [](const std::shared_ptr<::arrow::Array>& values, + const std::shared_ptr& encoded) { auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); ASSERT_NO_THROW(encoder->Put(*values)); auto buf = encoder->FlushValues(); ASSERT_TRUE(encoded->Equals(*buf)); }; - auto arrayToI32 = [](const std::shared_ptr<::arrow::Array>& lengths) { + auto ArrayToInt32Vector = [](const std::shared_ptr<::arrow::Array>& lengths) { std::vector arrays; auto data_ptr = checked_cast<::arrow::Int32Array*>(lengths.get()); for (int i = 0; i < lengths->length(); ++i) { @@ -2217,25 +2217,25 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { ::arrow::AssertArraysEqual(*values, *upcast_result); }; - auto CheckEncodeDecode = [&](std::string_view values, - std::shared_ptr<::arrow::Array> prefix_lengths, - std::shared_ptr<::arrow::Array> suffix_lengths, - std::string_view suffix_data) { - auto encoded = ::arrow::ConcatenateBuffers({DeltaEncode(arrayToI32(prefix_lengths)), - DeltaEncode(arrayToI32(suffix_lengths)), - std::make_shared(suffix_data)}) - .ValueOrDie(); - - CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), encoded); - CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), encoded); - CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), encoded); - CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), encoded); - - CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::utf8(), values)); - CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_utf8(), values)); - CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::binary(), values)); - CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); - }; + auto CheckEncodeDecode = + [&](std::string_view values, std::shared_ptr<::arrow::Array> prefix_lengths, + std::shared_ptr<::arrow::Array> suffix_lengths, std::string_view suffix_data) { + auto encoded = + ::arrow::ConcatenateBuffers({DeltaEncode(ArrayToInt32Vector(prefix_lengths)), + DeltaEncode(ArrayToInt32Vector(suffix_lengths)), + std::make_shared(suffix_data)}) + .ValueOrDie(); + + CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), encoded); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), encoded); + CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), encoded); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), encoded); + + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::utf8(), values)); + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_utf8(), values)); + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::binary(), values)); + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); + }; { auto values = R"(["axis", "axle", "babble", "babyhood"])"; From 51ff60a399b6564485350e43cd6f2d50a77fd182 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 8 Jul 2023 18:18:10 +0200 Subject: [PATCH 57/78] Repeats --- cpp/src/parquet/encoding_test.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 46d1759d585..c7cb33ecbd4 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1985,7 +1985,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; - void InitData(int nvalues, double null_probability) { + void InitData(int nvalues, int repeats, double null_probability) { constexpr int kMinPrefixLength = 0; constexpr int kMaxPrefixLength = 100; constexpr int kMaxElementLength = 1000; @@ -2022,14 +2022,14 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { draws_ = reinterpret_cast(array->value_data()->mutable_data()); } - void Execute(int nvalues, double null_probability) { - InitData(nvalues, null_probability); + void Execute(int nvalues, int repeats, double null_probability) { + InitData(nvalues, repeats, null_probability); CheckRoundtrip(); } void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, double null_probability) { - InitData(nvalues, null_probability); + InitData(nvalues, repeats, null_probability); int64_t size = num_values_ + valid_bits_offset; auto rand = ::arrow::random::RandomArrayGenerator(1923); @@ -2086,7 +2086,8 @@ using TestDeltaByteArrayEncodingTypes = ::testing::TypesExecute(0, 0)); + // TODO: repeats + ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/ 0, 0)); // TODO // ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); From 1d2fa5f1412e2671312ee8a5bf796fa02b28d3f6 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 9 Jul 2023 02:31:39 +0200 Subject: [PATCH 58/78] Enable DirectPut tests --- cpp/src/parquet/encoding_test.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index c7cb33ecbd4..4988ff16b48 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2087,7 +2087,7 @@ TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { // TODO: repeats - ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/ 0, 0)); + ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/0, 0)); // TODO // ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); @@ -2117,10 +2117,16 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { decoder->SetData(num_values, buf->data(), static_cast(buf->size())); typename EncodingTraits::Accumulator acc; - if (::arrow::is_string(array->type()->id())) { + using BuilderType = typename EncodingTraits::BuilderType; + if constexpr (std::is_same_v) { acc.builder = std::make_unique<::arrow::StringBuilder>(); - } else { + } else if constexpr (std::is_same_v) { acc.builder = std::make_unique<::arrow::BinaryBuilder>(); + } else if constexpr (std::is_same_v) { + acc.builder = std::make_unique<::arrow::FixedSizeBinaryBuilder>( + array->type(), default_memory_pool()); + } else { + acc.builder = std::make_unique(); } ASSERT_EQ(num_values, @@ -2165,9 +2171,7 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { USING_BASE_MEMBERS(); }; -using DeltaByteArrayEncodingDirectPutTypes = - ::testing::Types; // TODO: FLBAType -TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, DeltaByteArrayEncodingDirectPutTypes); +TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, TestDeltaByteArrayEncodingTypes); TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); From b53e84e8b6b3952231249b25279a2c6d1fb52c1c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 9 Jul 2023 03:35:33 +0200 Subject: [PATCH 59/78] GeneratePrefixedData --- cpp/src/parquet/encoding_test.cc | 50 ++++++++++---------------------- cpp/src/parquet/test_util.cc | 28 ++++++++++++++++++ cpp/src/parquet/test_util.h | 11 +++++++ 3 files changed, 55 insertions(+), 34 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 4988ff16b48..c6269df31a9 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1986,40 +1986,19 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { static constexpr int TYPE = Type::type_num; void InitData(int nvalues, int repeats, double null_probability) { - constexpr int kMinPrefixLength = 0; - constexpr int kMaxPrefixLength = 100; - constexpr int kMaxElementLength = 1000; - constexpr double kPrefixedProbability = 0.5; - constexpr int kSeed = 42; - auto rand = ::arrow::random::RandomArrayGenerator(kSeed); - - const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>( - rand.String(nvalues, kMinPrefixLength, kMaxPrefixLength, null_probability)); - const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>( - rand.Boolean(nvalues, - /*true_probability=*/kPrefixedProbability, - /*null_probability=*/0.0)); - ::arrow::StringBuilder builder(::arrow::default_memory_pool()); - - std::string prefix = ""; - for (int i = 0; i < nvalues; i++) { - if (prefix_array->IsNull(i)) { - PARQUET_THROW_NOT_OK(builder.AppendNull()); - } else { - const std::string element = prefix_array->GetString(i); - if (do_prefix->Value(i) && prefix.length() < kMaxElementLength) { - prefix = prefix.append(element); - } else { - prefix = element; - } - PARQUET_THROW_NOT_OK(builder.Append(prefix)); + num_values_ = nvalues * repeats; + input_bytes_.resize(num_values_ * sizeof(c_type)); + output_bytes_.resize(num_values_ * sizeof(c_type)); + draws_ = reinterpret_cast(input_bytes_.data()); + decode_buf_ = reinterpret_cast(output_bytes_.data()); + GeneratePrefixedData(nvalues, draws_, &data_buffer_); + + // add some repeated values + for (int j = 1; j < repeats; ++j) { + for (int i = 0; i < nvalues; ++i) { + draws_[nvalues * j + i] = draws_[i]; } } - - std::shared_ptr<::arrow::StringArray> array; - ASSERT_OK(builder.Finish(&array)); - num_values_ = static_cast(array->length() - array->null_count()); - draws_ = reinterpret_cast(array->value_data()->mutable_data()); } void Execute(int nvalues, int repeats, double null_probability) { @@ -2080,9 +2059,12 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { protected: USING_BASE_MEMBERS(); + std::vector input_bytes_; + std::vector output_bytes_; }; -using TestDeltaByteArrayEncodingTypes = ::testing::Types; +using TestDeltaByteArrayEncodingTypes = + ::testing::Types; // TODO: FLBAType TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { @@ -2090,7 +2072,7 @@ TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/0, 0)); // TODO - // ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0)); + ASSERT_NO_FATAL_FAILURE(this->Execute(250, 0, /*null_probability*/ 0)); // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( // /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ // 0)); diff --git a/cpp/src/parquet/test_util.cc b/cpp/src/parquet/test_util.cc index 9d104618bfd..7f9971a3aeb 100644 --- a/cpp/src/parquet/test_util.cc +++ b/cpp/src/parquet/test_util.cc @@ -132,5 +132,33 @@ void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int m random_byte_array(n, seed, buf, out, 0, max_size); } +void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, + int min_size, int max_size) { + constexpr double kPrefixedProbability = 0.5; + std::default_random_engine gen(seed); + std::uniform_int_distribution d1(min_size, max_size); + std::uniform_int_distribution d2(0, 255); + std::uniform_real_distribution d3(0.0, 1.0); + for (int i = 0; i < n; ++i) { + int len = d1(gen); + out[i].len = len; + out[i].ptr = buf; + int idx = 0; + + bool do_prefix = d3(gen) < kPrefixedProbability && i > 0; + if (do_prefix) { + std::uniform_int_distribution d4(min_size, len); + int prefix_len = d4(gen); + for (; idx < prefix_len; ++idx) { + buf[idx] = buf[idx - 1]; + } + } + for (; idx < len; ++idx) { + buf[idx] = static_cast(d2(gen)); + } + buf += len; + } +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index b0aafa037ea..9c6f8342b1c 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -155,6 +155,9 @@ void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int m void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size); +void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, + int min_size, int max_size); + template std::shared_ptr EncodeValues(Encoding::type encoding, bool use_dictionary, const Sequence& values, int length, @@ -783,6 +786,14 @@ inline void GenerateData(int num_values, ByteArray* out, random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len); } +template +inline void GeneratePrefixedData(int num_values, T* out, std::vector* heap) { + // seed the prng so failure is deterministic + int max_byte_array_len = 12; + heap->resize(num_values * max_byte_array_len); + prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len); +} + static constexpr int kGenerateDataFLBALength = 8; template <> From 8911b5eb8d0f043cbaf44e7c16655e012abf6ce8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 11 Jul 2023 21:54:10 +0200 Subject: [PATCH 60/78] Work --- cpp/src/parquet/encoding_test.cc | 42 +++++++++++++++----------------- cpp/src/parquet/test_util.cc | 12 +++++++++ cpp/src/parquet/test_util.h | 19 +++++++++++++++ 3 files changed, 50 insertions(+), 23 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index c6269df31a9..5b95607f7df 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1985,7 +1985,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; - void InitData(int nvalues, int repeats, double null_probability) { + void InitData(int nvalues, int repeats) { num_values_ = nvalues * repeats; input_bytes_.resize(num_values_ * sizeof(c_type)); output_bytes_.resize(num_values_ * sizeof(c_type)); @@ -2001,14 +2001,14 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { } } - void Execute(int nvalues, int repeats, double null_probability) { - InitData(nvalues, repeats, null_probability); + void Execute(int nvalues, int repeats) { + InitData(nvalues, repeats); CheckRoundtrip(); } void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, double null_probability) { - InitData(nvalues, repeats, null_probability); + InitData(nvalues, repeats); int64_t size = num_values_ + valid_bits_offset; auto rand = ::arrow::random::RandomArrayGenerator(1923); @@ -2064,23 +2064,19 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { }; using TestDeltaByteArrayEncodingTypes = - ::testing::Types; // TODO: FLBAType + ::testing::Types; // TODO: FLBAType TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { - // TODO: repeats - ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/0, 0)); - // TODO - - ASSERT_NO_FATAL_FAILURE(this->Execute(250, 0, /*null_probability*/ 0)); - // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( - // /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ - // 0)); - // - // ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1)); - // ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( - // /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, - // /*null_probability*/ 0.5)); + ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/0)); + ASSERT_NO_FATAL_FAILURE(this->Execute(250, 5)); + ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 1)); + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ + 0)); + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( + /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, + /*null_probability*/ 0.5)); } template @@ -2153,11 +2149,11 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { USING_BASE_MEMBERS(); }; -TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, TestDeltaByteArrayEncodingTypes); - -TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { - ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); -} +// TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, TestDeltaByteArrayEncodingTypes); +// +// TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { +// ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); +// } TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { auto CheckEncode = [](const std::shared_ptr<::arrow::Array>& values, diff --git a/cpp/src/parquet/test_util.cc b/cpp/src/parquet/test_util.cc index 7f9971a3aeb..72baafb3923 100644 --- a/cpp/src/parquet/test_util.cc +++ b/cpp/src/parquet/test_util.cc @@ -160,5 +160,17 @@ void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* o } } +void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out) { + std::default_random_engine gen(seed); + std::uniform_int_distribution d(0, 255); + for (int i = 0; i < n; ++i) { + out[i].ptr = buf; + for (int j = 0; j < len; ++j) { + buf[j] = static_cast(d(gen)); + } + buf += len; + } +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index 9c6f8342b1c..814136cab2a 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -158,6 +158,8 @@ void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int m void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size, int max_size); +void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out); + template std::shared_ptr EncodeValues(Encoding::type encoding, bool use_dictionary, const Sequence& values, int length, @@ -794,8 +796,25 @@ inline void GeneratePrefixedData(int num_values, T* out, std::vector* h prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len); } +template <> +inline void GeneratePrefixedData(int num_values, ByteArray* out, + std::vector* heap) { + // seed the prng so failure is deterministic + int max_byte_array_len = 12; + heap->resize(num_values * max_byte_array_len); + prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len); +} + static constexpr int kGenerateDataFLBALength = 8; +template <> +inline void GeneratePrefixedData(int num_values, FLBA* out, + std::vector* heap) { + // seed the prng so failure is deterministic + heap->resize(num_values * kGenerateDataFLBALength); + prefixed_random_byte_array(num_values, 0, heap->data(), kGenerateDataFLBALength, out); +} + template <> inline void GenerateData(int num_values, FLBA* out, std::vector* heap) { // seed the prng so failure is deterministic From 2538ab89367ccf4bfa74a997be6e3442348900a5 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 12 Jul 2023 03:26:31 +0200 Subject: [PATCH 61/78] Enable DeltaByteArrayEncodingDirectPut sans FLBAType --- cpp/src/parquet/encoding_test.cc | 55 +++++++++++++------------------- cpp/src/parquet/test_util.cc | 47 +++++++++++++++------------ cpp/src/parquet/test_util.h | 23 ++++++++----- 3 files changed, 64 insertions(+), 61 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 5b95607f7df..c9b473f25b8 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1985,13 +1985,13 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; - void InitData(int nvalues, int repeats) { + void InitData(int nvalues, int repeats, double prefixed_probability) { num_values_ = nvalues * repeats; input_bytes_.resize(num_values_ * sizeof(c_type)); output_bytes_.resize(num_values_ * sizeof(c_type)); draws_ = reinterpret_cast(input_bytes_.data()); decode_buf_ = reinterpret_cast(output_bytes_.data()); - GeneratePrefixedData(nvalues, draws_, &data_buffer_); + GeneratePrefixedData(nvalues, draws_, &data_buffer_, prefixed_probability); // add some repeated values for (int j = 1; j < repeats; ++j) { @@ -2001,14 +2001,14 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { } } - void Execute(int nvalues, int repeats) { - InitData(nvalues, repeats); + void Execute(int nvalues, int repeats, double prefixed_probability) { + InitData(nvalues, repeats, prefixed_probability); CheckRoundtrip(); } void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, - double null_probability) { - InitData(nvalues, repeats); + double null_probability, double prefixed_probability) { + InitData(nvalues, repeats, prefixed_probability); int64_t size = num_values_ + valid_bits_offset; auto rand = ::arrow::random::RandomArrayGenerator(1923); @@ -2063,20 +2063,19 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { std::vector output_bytes_; }; -using TestDeltaByteArrayEncodingTypes = - ::testing::Types; // TODO: FLBAType +using TestDeltaByteArrayEncodingTypes = ::testing::Types; TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes); TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { - ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/0)); - ASSERT_NO_FATAL_FAILURE(this->Execute(250, 5)); - ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 1)); + ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/0, /*prefixed_probability=*/0.1)); + ASSERT_NO_FATAL_FAILURE(this->Execute(250, 5, 0.2)); + ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 1, 0.3)); ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ - 0)); + 0, 0.4)); ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, - /*null_probability*/ 0.5)); + /*null_probability*/ 0.5, 0.5)); } template @@ -2096,16 +2095,7 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { typename EncodingTraits::Accumulator acc; using BuilderType = typename EncodingTraits::BuilderType; - if constexpr (std::is_same_v) { - acc.builder = std::make_unique<::arrow::StringBuilder>(); - } else if constexpr (std::is_same_v) { - acc.builder = std::make_unique<::arrow::BinaryBuilder>(); - } else if constexpr (std::is_same_v) { - acc.builder = std::make_unique<::arrow::FixedSizeBinaryBuilder>( - array->type(), default_memory_pool()); - } else { - acc.builder = std::make_unique(); - } + acc.builder = std::make_unique(array->type(), default_memory_pool()); ASSERT_EQ(num_values, decoder->DecodeArrow(static_cast(array->length()), @@ -2128,15 +2118,12 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { constexpr double kNullProbability = 0.25; constexpr int kSeed = 42; ::arrow::random::RandomArrayGenerator rag{kSeed}; - std::shared_ptr<::arrow::Array> values = - rag.String(0, kMinLength, kMaxLength, kNullProbability); + std::shared_ptr<::arrow::Array> values = rag.BinaryWithRepeats( + /*size=*/0, /*unique=*/0, kMinLength, kMaxLength, kNullProbability); CheckDirectPut(values); for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { rag = ::arrow::random::RandomArrayGenerator(seed); - values = rag.String(kSize, kMinLength, kMaxLength, kNullProbability); - CheckDirectPut(values); - values = rag.BinaryWithRepeats(kSize, kNumUnique, kMinLength, kMaxLength, kNullProbability); CheckDirectPut(values); @@ -2149,11 +2136,13 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { USING_BASE_MEMBERS(); }; -// TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, TestDeltaByteArrayEncodingTypes); -// -// TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { -// ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); -// } +using TestDeltaByteArrayEncodingTypes2 = + ::testing::Types; // TODO FLBAType +TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, TestDeltaByteArrayEncodingTypes2); + +TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { + ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); +} TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { auto CheckEncode = [](const std::shared_ptr<::arrow::Array>& values, diff --git a/cpp/src/parquet/test_util.cc b/cpp/src/parquet/test_util.cc index 72baafb3923..fe52e314e83 100644 --- a/cpp/src/parquet/test_util.cc +++ b/cpp/src/parquet/test_util.cc @@ -116,7 +116,7 @@ void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int m int max_size) { std::default_random_engine gen(seed); std::uniform_int_distribution d1(min_size, max_size); - std::uniform_int_distribution d2(0, 255); + std::uniform_int_distribution d2(int{0}, int{255}); for (int i = 0; i < n; ++i) { int len = d1(gen); out[i].len = len; @@ -133,40 +133,47 @@ void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int m } void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, - int min_size, int max_size) { - constexpr double kPrefixedProbability = 0.5; + int min_size, int max_size, double prefixed_probability) { std::default_random_engine gen(seed); std::uniform_int_distribution d1(min_size, max_size); - std::uniform_int_distribution d2(0, 255); - std::uniform_real_distribution d3(0.0, 1.0); + std::uniform_int_distribution d2(int{0}, int{255}); + std::uniform_real_distribution d3(double{0}, double{1}); + for (int i = 0; i < n; ++i) { int len = d1(gen); out[i].len = len; out[i].ptr = buf; - int idx = 0; - - bool do_prefix = d3(gen) < kPrefixedProbability && i > 0; - if (do_prefix) { - std::uniform_int_distribution d4(min_size, len); - int prefix_len = d4(gen); - for (; idx < prefix_len; ++idx) { - buf[idx] = buf[idx - 1]; - } + + bool do_prefix = d3(gen) < prefixed_probability && i > 0; + std::uniform_int_distribution d4(std::min(min_size, len), len); + int prefix_len = do_prefix ? d4(gen) : 0; + for (int j = 0; j < prefix_len; ++j) { + buf[j] = buf[j - 1]; } - for (; idx < len; ++idx) { - buf[idx] = static_cast(d2(gen)); + for (int j = prefix_len; j < len; ++j) { + buf[j] = d2(gen); } buf += len; } } -void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out) { +void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out, + double prefixed_probability) { std::default_random_engine gen(seed); - std::uniform_int_distribution d(0, 255); + std::uniform_int_distribution d1(int{0}, int{255}); + std::uniform_real_distribution d2(double{0}, double{1}); + std::uniform_int_distribution d3(std::min(2, len), len); + for (int i = 0; i < n; ++i) { out[i].ptr = buf; - for (int j = 0; j < len; ++j) { - buf[j] = static_cast(d(gen)); + + bool do_prefix = d2(gen) < prefixed_probability && i > 0; + int prefix_len = do_prefix ? d3(gen) : 0; + for (int j = 0; j < prefix_len; ++j) { + buf[j] = buf[j - 1]; + } + for (int j = prefix_len; j < len; ++j) { + buf[j] = d1(gen); } buf += len; } diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index 814136cab2a..7738fe041f7 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -156,9 +156,10 @@ void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int m void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size); void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, - int min_size, int max_size); + int min_size, int max_size, double prefixed_probability); -void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out); +void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out, + double prefixed_probability); template std::shared_ptr EncodeValues(Encoding::type encoding, bool use_dictionary, @@ -789,30 +790,36 @@ inline void GenerateData(int num_values, ByteArray* out, } template -inline void GeneratePrefixedData(int num_values, T* out, std::vector* heap) { +inline void GeneratePrefixedData(int num_values, T* out, std::vector* heap, + double prefixed_probability) { // seed the prng so failure is deterministic int max_byte_array_len = 12; heap->resize(num_values * max_byte_array_len); - prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len); + prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len, + prefixed_probability); } template <> inline void GeneratePrefixedData(int num_values, ByteArray* out, - std::vector* heap) { + std::vector* heap, + double prefixed_probability) { // seed the prng so failure is deterministic int max_byte_array_len = 12; heap->resize(num_values * max_byte_array_len); - prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len); + prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len, + prefixed_probability); } static constexpr int kGenerateDataFLBALength = 8; template <> inline void GeneratePrefixedData(int num_values, FLBA* out, - std::vector* heap) { + std::vector* heap, + double prefixed_probability) { // seed the prng so failure is deterministic heap->resize(num_values * kGenerateDataFLBALength); - prefixed_random_byte_array(num_values, 0, heap->data(), kGenerateDataFLBALength, out); + prefixed_random_byte_array(num_values, 0, heap->data(), kGenerateDataFLBALength, out, + prefixed_probability); } template <> From c19a528029be99b804cc9df70329e6651608ff66 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 13 Jul 2023 00:20:53 +0200 Subject: [PATCH 62/78] Refactoring DeltaByteArrayEncodingDirectPut --- cpp/src/parquet/encoding_test.cc | 33 +++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index c9b473f25b8..02a84618f93 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2110,7 +2110,23 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { ::arrow::AssertArraysEqual(*array, *result); } - void CheckRoundtrip() override { + void CheckRoundtripFLBA() { + constexpr int64_t kSize = 50; + constexpr int kSeed = 42; + constexpr int kByteWidth = 4; + ::arrow::random::RandomArrayGenerator rag{kSeed}; + std::shared_ptr<::arrow::Array> values = + rag.FixedSizeBinary(/*size=*/0, /*byte_width=*/kByteWidth); + CheckDirectPut(values); + + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + rag = ::arrow::random::RandomArrayGenerator(seed); + values = rag.FixedSizeBinary(kSize + seed, kByteWidth); + CheckDirectPut(values); + } + } + + void CheckRoundtripByteArray() { constexpr int64_t kSize = 50; constexpr int32_t kMinLength = 0; constexpr int32_t kMaxLength = 10; @@ -2130,15 +2146,22 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { } } - void Execute() { CheckRoundtrip(); } + void CheckRoundtrip() override { + using ArrowType = typename EncodingTraits::ArrowType; + using IsFixedSizeBinary = ::arrow::is_fixed_size_binary_type; + + if constexpr (IsFixedSizeBinary::value) { + CheckRoundtripFLBA(); + } else { + CheckRoundtripByteArray(); + } + } protected: USING_BASE_MEMBERS(); }; -using TestDeltaByteArrayEncodingTypes2 = - ::testing::Types; // TODO FLBAType -TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, TestDeltaByteArrayEncodingTypes2); +TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, TestDeltaByteArrayEncodingTypes); TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); From 74da1b70e0dca938ad71b2f90c57c31aa3bd4665 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 13 Jul 2023 00:51:09 +0200 Subject: [PATCH 63/78] random_byte_array etc --- cpp/src/parquet/encoding_test.cc | 4 ++-- cpp/src/parquet/test_util.cc | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 02a84618f93..9e892239409 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2127,7 +2127,7 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { } void CheckRoundtripByteArray() { - constexpr int64_t kSize = 50; + constexpr int64_t kSize = 500; constexpr int32_t kMinLength = 0; constexpr int32_t kMaxLength = 10; constexpr int32_t kNumUnique = 10; @@ -2135,7 +2135,7 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { constexpr int kSeed = 42; ::arrow::random::RandomArrayGenerator rag{kSeed}; std::shared_ptr<::arrow::Array> values = rag.BinaryWithRepeats( - /*size=*/0, /*unique=*/0, kMinLength, kMaxLength, kNullProbability); + /*size=*/1, /*unique=*/1, kMinLength, kMaxLength, kNullProbability); CheckDirectPut(values); for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { diff --git a/cpp/src/parquet/test_util.cc b/cpp/src/parquet/test_util.cc index fe52e314e83..63ebdec305f 100644 --- a/cpp/src/parquet/test_util.cc +++ b/cpp/src/parquet/test_util.cc @@ -116,7 +116,7 @@ void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int m int max_size) { std::default_random_engine gen(seed); std::uniform_int_distribution d1(min_size, max_size); - std::uniform_int_distribution d2(int{0}, int{255}); + std::uniform_int_distribution d2(0, 255); for (int i = 0; i < n; ++i) { int len = d1(gen); out[i].len = len; @@ -136,8 +136,8 @@ void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* o int min_size, int max_size, double prefixed_probability) { std::default_random_engine gen(seed); std::uniform_int_distribution d1(min_size, max_size); - std::uniform_int_distribution d2(int{0}, int{255}); - std::uniform_real_distribution d3(double{0}, double{1}); + std::uniform_int_distribution d2(0, 255); + std::uniform_real_distribution d3(0, 1); for (int i = 0; i < n; ++i) { int len = d1(gen); @@ -145,13 +145,13 @@ void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* o out[i].ptr = buf; bool do_prefix = d3(gen) < prefixed_probability && i > 0; - std::uniform_int_distribution d4(std::min(min_size, len), len); + std::uniform_int_distribution d4(min_size, len); int prefix_len = do_prefix ? d4(gen) : 0; for (int j = 0; j < prefix_len; ++j) { - buf[j] = buf[j - 1]; + buf[j] = out[i - 1].ptr[j]; } for (int j = prefix_len; j < len; ++j) { - buf[j] = d2(gen); + buf[j] = static_cast(d2(gen)); } buf += len; } @@ -160,9 +160,9 @@ void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* o void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out, double prefixed_probability) { std::default_random_engine gen(seed); - std::uniform_int_distribution d1(int{0}, int{255}); - std::uniform_real_distribution d2(double{0}, double{1}); - std::uniform_int_distribution d3(std::min(2, len), len); + std::uniform_int_distribution d1(0, 255); + std::uniform_real_distribution d2(0, 1); + std::uniform_int_distribution d3(0, len); for (int i = 0; i < n; ++i) { out[i].ptr = buf; @@ -170,10 +170,10 @@ void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLB bool do_prefix = d2(gen) < prefixed_probability && i > 0; int prefix_len = do_prefix ? d3(gen) : 0; for (int j = 0; j < prefix_len; ++j) { - buf[j] = buf[j - 1]; + buf[j] = out[i - 1].ptr[j]; } for (int j = prefix_len; j < len; ++j) { - buf[j] = d1(gen); + buf[j] = static_cast(d1(gen)); } buf += len; } From 92b457ae1cbb0f1d57ae8c5cd6c2d94bce7bad6a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 15 Jul 2023 00:32:45 +0200 Subject: [PATCH 64/78] Review feedback --- cpp/src/parquet/encoding.cc | 48 +++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index ed4afcd856e..26a0f3746c1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1189,7 +1189,7 @@ struct ArrowBinaryHelper> { explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* acc) { builder = acc->builder.get(); - chunks = acc->chunks; + chunks = &acc->chunks; if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, builder->value_data_length(), &chunk_space_remaining))) { @@ -1200,7 +1200,7 @@ struct ArrowBinaryHelper result; RETURN_NOT_OK(builder->Finish(&result)); - chunks.push_back(std::move(result)); + chunks->push_back(std::move(result)); chunk_space_remaining = ::arrow::kBinaryMemoryLimit; return Status::OK(); } @@ -1220,7 +1220,7 @@ struct ArrowBinaryHelperAppendNull(); } typename EncodingTraits::BuilderType* builder; - std::vector> chunks; + std::vector>* chunks; int64_t chunk_space_remaining; }; @@ -3020,6 +3020,8 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
type_length(); + uint32_t flba_len = descr_->type_length(); std::string_view last_value_view = last_value_; constexpr int kBatchSize = 256; std::array prefix_lengths; std::array suffixes; - auto visitor = VisitorType{src, len}; + auto visitor = VisitorType{src, flba_len}; for (int i = 0; i < num_values; i += kBatchSize) { const int batch_size = std::min(kBatchSize, num_values - i); for (int j = 0; j < batch_size; ++j) { const int idx = i + j; - auto view = visitor[idx]; - len = visitor.len(idx); + const auto view = visitor[idx]; + const uint32_t len = static_cast(view.length()); - uint32_t k = 0; - const uint32_t common_length = + uint32_t common_prefix_length = 0; + const uint32_t maximum_common_prefix_length = std::min(len, static_cast(last_value_view.length())); - while (k < common_length) { - if (last_value_view[k] != view[k]) { + while (common_prefix_length < maximum_common_prefix_length) { + if (last_value_view[common_prefix_length] != view[common_prefix_length]) { break; } - k++; + common_prefix_length++; } last_value_view = view; - prefix_lengths[j] = k; - const uint32_t suffix_length = len - k; - const uint8_t* suffix_ptr = src[idx].ptr + k; + prefix_lengths[j] = common_prefix_length; + const uint32_t suffix_length = len - common_prefix_length; + const uint8_t* suffix_ptr = src[idx].ptr + common_prefix_length; // Convert to ByteArray, so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); @@ -3114,25 +3116,25 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(j)}, 1); + prefix_length_encoder_.Put({static_cast(common_prefix_length)}, 1); last_value_view = view; - const auto suffix_length = static_cast(len - j); + const auto suffix_length = static_cast(len - common_prefix_length); if (suffix_length == 0) { suffix_encoder_.Put(&empty_, 1); return Status::OK(); } - const uint8_t* suffix_ptr = src.ptr + j; + const uint8_t* suffix_ptr = src.ptr + common_prefix_length; // Convert to ByteArray, so it can be passed to the suffix_encoder_. const ByteArray suffix(suffix_length, suffix_ptr); suffix_encoder_.Put(&suffix, 1); From e4b96aa634e8787b1b4c780da876a2edd1e9753e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 20 Jul 2023 12:21:39 +0200 Subject: [PATCH 65/78] Apply suggestions from code review Co-authored-by: mwish <1506118561@qq.com> Co-authored-by: Gang Wu Co-authored-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 4 ++-- cpp/src/parquet/test_util.cc | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 26a0f3746c1..9d09f64f178 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3022,7 +3022,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(kEmpty.size()), @@ -3043,7 +3043,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
memory_pool())); T* data = reinterpret_cast(buffer->mutable_data()); diff --git a/cpp/src/parquet/test_util.cc b/cpp/src/parquet/test_util.cc index 63ebdec305f..abf17f14071 100644 --- a/cpp/src/parquet/test_util.cc +++ b/cpp/src/parquet/test_util.cc @@ -135,9 +135,9 @@ void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int m void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size, int max_size, double prefixed_probability) { std::default_random_engine gen(seed); - std::uniform_int_distribution d1(min_size, max_size); - std::uniform_int_distribution d2(0, 255); - std::uniform_real_distribution d3(0, 1); + std::uniform_int_distribution dist_size(min_size, max_size); + std::uniform_int_distribution dist_byte(0, 255); + std::uniform_real_distribution dist_has_prefix(0, 1); for (int i = 0; i < n; ++i) { int len = d1(gen); From 45a6d510e7380a7e2827b0be975b86275fe0af57 Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 20 Jul 2023 12:57:47 +0200 Subject: [PATCH 66/78] Review feedback --- cpp/src/parquet/encoding.cc | 2 -- cpp/src/parquet/test_util.cc | 25 ++++++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 9d09f64f178..2a5ecda901d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3020,8 +3020,6 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
#include +#include "arrow/testing/uniform_real.h" #include "parquet/column_page.h" #include "parquet/column_reader.h" #include "parquet/column_writer.h" @@ -137,21 +138,23 @@ void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* o std::default_random_engine gen(seed); std::uniform_int_distribution dist_size(min_size, max_size); std::uniform_int_distribution dist_byte(0, 255); - std::uniform_real_distribution dist_has_prefix(0, 1); + std::bernoulli_distribution dist_has_prefix(prefixed_probability); + std::uniform_real_distribution dist_prefix_length(0, 1); for (int i = 0; i < n; ++i) { - int len = d1(gen); + int len = dist_size(gen); out[i].len = len; out[i].ptr = buf; - bool do_prefix = d3(gen) < prefixed_probability && i > 0; + bool do_prefix = dist_has_prefix(gen) && i > 0; std::uniform_int_distribution d4(min_size, len); - int prefix_len = do_prefix ? d4(gen) : 0; + int prefix_len = + do_prefix ? static_cast(std::ceil(len * dist_prefix_length(gen))) : 0; for (int j = 0; j < prefix_len; ++j) { buf[j] = out[i - 1].ptr[j]; } for (int j = prefix_len; j < len; ++j) { - buf[j] = static_cast(d2(gen)); + buf[j] = static_cast(dist_byte(gen)); } buf += len; } @@ -160,20 +163,20 @@ void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* o void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out, double prefixed_probability) { std::default_random_engine gen(seed); - std::uniform_int_distribution d1(0, 255); - std::uniform_real_distribution d2(0, 1); - std::uniform_int_distribution d3(0, len); + std::uniform_int_distribution dist_byte(0, 255); + std::bernoulli_distribution dist_has_prefix(prefixed_probability); + std::uniform_int_distribution dist_size(0, len); for (int i = 0; i < n; ++i) { out[i].ptr = buf; - bool do_prefix = d2(gen) < prefixed_probability && i > 0; - int prefix_len = do_prefix ? d3(gen) : 0; + bool do_prefix = dist_has_prefix(gen) && i > 0; + int prefix_len = do_prefix ? dist_size(gen) : 0; for (int j = 0; j < prefix_len; ++j) { buf[j] = out[i - 1].ptr[j]; } for (int j = prefix_len; j < len; ++j) { - buf[j] = static_cast(d1(gen)); + buf[j] = static_cast(dist_byte(gen)); } buf += len; } From 1e101496601a011e2391dccc5e9d56e97b409ee6 Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 20 Jul 2023 16:44:41 +0200 Subject: [PATCH 67/78] Review feedback --- cpp/src/parquet/encoding.cc | 4 +++- cpp/src/parquet/encoding_test.cc | 41 +++++++++----------------------- cpp/src/parquet/test_util.h | 8 +------ 3 files changed, 15 insertions(+), 38 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 2a5ecda901d..e11992b2f67 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1215,7 +1215,7 @@ struct ArrowBinaryHelperUnsafeAppendNull(); } - virtual Status Append(const uint8_t* data, int32_t length); + Status Append(const uint8_t* data, int32_t length); Status AppendNull() { return builder->AppendNull(); } @@ -1226,12 +1226,14 @@ struct ArrowBinaryHelper Status ArrowBinaryHelper::Append(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); chunk_space_remaining -= length; return builder->Append(data, length); } template <> Status ArrowBinaryHelper::Append(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); chunk_space_remaining -= length; return builder->Append(data); } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 9e892239409..6637a061a3b 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1984,8 +1984,9 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { public: using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; + static constexpr double prefixed_probability = 0.5; - void InitData(int nvalues, int repeats, double prefixed_probability) { + void InitData(int nvalues, int repeats) { num_values_ = nvalues * repeats; input_bytes_.resize(num_values_ * sizeof(c_type)); output_bytes_.resize(num_values_ * sizeof(c_type)); @@ -2001,24 +2002,6 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { } } - void Execute(int nvalues, int repeats, double prefixed_probability) { - InitData(nvalues, repeats, prefixed_probability); - CheckRoundtrip(); - } - - void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, - double null_probability, double prefixed_probability) { - InitData(nvalues, repeats, prefixed_probability); - - int64_t size = num_values_ + valid_bits_offset; - auto rand = ::arrow::random::RandomArrayGenerator(1923); - const auto array = rand.UInt8(size, /*min=*/0, /*max=*/100, null_probability); - const auto valid_bits = array->null_bitmap_data(); - if (valid_bits) { - CheckRoundtripSpaced(valid_bits, valid_bits_offset); - } - } - void CheckRoundtrip() override { auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, /*use_dictionary=*/false, descr_.get()); @@ -2067,15 +2050,15 @@ using TestDeltaByteArrayEncodingTypes = ::testing::TypesExecute(0, /*repeats=*/0, /*prefixed_probability=*/0.1)); - ASSERT_NO_FATAL_FAILURE(this->Execute(250, 5, 0.2)); - ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 1, 0.3)); + ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/0)); + ASSERT_NO_FATAL_FAILURE(this->Execute(250, 5)); + ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 1)); ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, /*null_probability*/ - 0, 0.4)); + 0)); ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced( /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64, - /*null_probability*/ 0.5, 0.5)); + /*null_probability*/ 0.5)); } template @@ -2120,7 +2103,6 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { CheckDirectPut(values); for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { - rag = ::arrow::random::RandomArrayGenerator(seed); values = rag.FixedSizeBinary(kSize + seed, kByteWidth); CheckDirectPut(values); } @@ -2138,8 +2120,7 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { /*size=*/1, /*unique=*/1, kMinLength, kMaxLength, kNullProbability); CheckDirectPut(values); - for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { - rag = ::arrow::random::RandomArrayGenerator(seed); + for (int i = 0; i < 10; ++i) { values = rag.BinaryWithRepeats(kSize, kNumUnique, kMinLength, kMaxLength, kNullProbability); CheckDirectPut(values); @@ -2177,12 +2158,12 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { }; auto ArrayToInt32Vector = [](const std::shared_ptr<::arrow::Array>& lengths) { - std::vector arrays; + std::vector vector; auto data_ptr = checked_cast<::arrow::Int32Array*>(lengths.get()); for (int i = 0; i < lengths->length(); ++i) { - arrays.push_back(data_ptr->GetView(i)); + vector.push_back(data_ptr->GetView(i)); } - return arrays; + return vector; }; auto CheckDecode = [](std::shared_ptr buf, diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index 7738fe041f7..657f5241945 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -791,13 +791,7 @@ inline void GenerateData(int num_values, ByteArray* out, template inline void GeneratePrefixedData(int num_values, T* out, std::vector* heap, - double prefixed_probability) { - // seed the prng so failure is deterministic - int max_byte_array_len = 12; - heap->resize(num_values * max_byte_array_len); - prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len, - prefixed_probability); -} + double prefixed_probability); template <> inline void GeneratePrefixedData(int num_values, ByteArray* out, From 6c6fbde160189c04187ee2906cff3d6d6f501331 Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 20 Jul 2023 17:21:40 +0200 Subject: [PATCH 68/78] Review feedback --- cpp/src/parquet/encoding.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index e11992b2f67..938b23c1ae1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1207,11 +1207,7 @@ struct ArrowBinaryHelperUnsafeAppend(data, length); - } + void UnsafeAppend(const uint8_t* data, int32_t length); void UnsafeAppendNull() { builder->UnsafeAppendNull(); } @@ -1238,6 +1234,13 @@ Status ArrowBinaryHelper::Append(const uint8_t* data, int32_t length) return builder->Append(data); } +template <> +void ArrowBinaryHelper::UnsafeAppend(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); + chunk_space_remaining -= length; + builder->UnsafeAppend(data, length); +} + template <> inline int PlainDecoder::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, From 6930a7918e86dbde9cdd2d8a5e80d59345d6f049 Mon Sep 17 00:00:00 2001 From: Rok Date: Fri, 21 Jul 2023 16:27:22 +0200 Subject: [PATCH 69/78] Review feedback --- cpp/src/parquet/encoding_test.cc | 54 ++++++-------------------------- 1 file changed, 10 insertions(+), 44 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 6637a061a3b..52c116bd87a 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1705,11 +1705,13 @@ class TestDeltaLengthByteArrayEncoding : public TestEncodingBase { using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; + virtual Encoding::type GetEncoding() { return Encoding::DELTA_LENGTH_BYTE_ARRAY; } + virtual void CheckRoundtrip() { - auto encoder = MakeTypedEncoder(Encoding::DELTA_LENGTH_BYTE_ARRAY, + auto encoding = GetEncoding(); + auto encoder = MakeTypedEncoder(encoding, /*use_dictionary=*/false, descr_.get()); - auto decoder = - MakeTypedDecoder(Encoding::DELTA_LENGTH_BYTE_ARRAY, descr_.get()); + auto decoder = MakeTypedDecoder(encoding, descr_.get()); encoder->Put(draws_, num_values_); encode_buffer_ = encoder->FlushValues(); @@ -1722,10 +1724,10 @@ class TestDeltaLengthByteArrayEncoding : public TestEncodingBase { } void CheckRoundtripSpaced(const uint8_t* valid_bits, int64_t valid_bits_offset) { - auto encoder = MakeTypedEncoder(Encoding::DELTA_LENGTH_BYTE_ARRAY, + auto encoding = GetEncoding(); + auto encoder = MakeTypedEncoder(encoding, /*use_dictionary=*/false, descr_.get()); - auto decoder = - MakeTypedDecoder(Encoding::DELTA_LENGTH_BYTE_ARRAY, descr_.get()); + auto decoder = MakeTypedDecoder(encoding, descr_.get()); int null_count = 0; for (auto i = 0; i < num_values_; i++) { if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) { @@ -1980,7 +1982,7 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) { // DELTA_BYTE_ARRAY encode/decode tests. template -class TestDeltaByteArrayEncoding : public TestEncodingBase { +class TestDeltaByteArrayEncoding : public TestDeltaLengthByteArrayEncoding { public: using c_type = typename Type::c_type; static constexpr int TYPE = Type::type_num; @@ -2002,43 +2004,7 @@ class TestDeltaByteArrayEncoding : public TestEncodingBase { } } - void CheckRoundtrip() override { - auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, - /*use_dictionary=*/false, descr_.get()); - auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY, descr_.get()); - - encoder->Put(draws_, num_values_); - encode_buffer_ = encoder->FlushValues(); - - decoder->SetData(num_values_, encode_buffer_->data(), - static_cast(encode_buffer_->size())); - int values_decoded = decoder->Decode(decode_buf_, num_values_); - ASSERT_EQ(num_values_, values_decoded); - ASSERT_NO_FATAL_FAILURE(VerifyResults(decode_buf_, draws_, num_values_)); - } - - void CheckRoundtripSpaced(const uint8_t* valid_bits, - int64_t valid_bits_offset) override { - auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY, - /*use_dictionary=*/false, descr_.get()); - auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY, descr_.get()); - int null_count = 0; - for (auto i = 0; i < num_values_; i++) { - if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) { - null_count++; - } - } - - encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); - encode_buffer_ = encoder->FlushValues(); - decoder->SetData(num_values_ - null_count, encode_buffer_->data(), - static_cast(encode_buffer_->size())); - auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, - valid_bits, valid_bits_offset); - ASSERT_EQ(num_values_, values_decoded); - ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced(decode_buf_, draws_, num_values_, - valid_bits, valid_bits_offset)); - } + Encoding::type GetEncoding() override { return Encoding::DELTA_BYTE_ARRAY; } protected: USING_BASE_MEMBERS(); From 409a6ee7c5607426592f6741f90b6c3d68b0ba51 Mon Sep 17 00:00:00 2001 From: Rok Date: Sat, 22 Jul 2023 01:56:06 +0200 Subject: [PATCH 70/78] Review feedback --- cpp/src/parquet/encoding.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 938b23c1ae1..f518e861d2d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3047,9 +3047,9 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
memory_pool())); - T* data = reinterpret_cast(buffer->mutable_data()); + PARQUET_ASSIGN_OR_THROW( + buffer_, ::arrow::AllocateBuffer(num_values * sizeof(T), this->memory_pool())); + T* data = reinterpret_cast(buffer_->mutable_data()); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); Put(data, num_valid_values); @@ -3153,6 +3153,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
suffix_encoder_; std::string last_value_; const ByteArray empty_; + std::unique_ptr buffer_; }; struct ByteArrayVisitor { From 7dc32e1ab2200fdbf7b9ebb9a58dd8222d632d4e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 7 Aug 2023 14:46:39 +0200 Subject: [PATCH 71/78] Apply suggestions from code review Co-authored-by: mwish <1506118561@qq.com> --- cpp/src/parquet/test_util.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index 657f5241945..076865f51ac 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -800,8 +800,8 @@ inline void GeneratePrefixedData(int num_values, ByteArray* out, // seed the prng so failure is deterministic int max_byte_array_len = 12; heap->resize(num_values * max_byte_array_len); - prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len, - prefixed_probability); + prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(), out, /*min_size=*/2, + /*max_size=*/max_byte_array_len, prefixed_probability); } static constexpr int kGenerateDataFLBALength = 8; @@ -812,8 +812,8 @@ inline void GeneratePrefixedData(int num_values, FLBA* out, double prefixed_probability) { // seed the prng so failure is deterministic heap->resize(num_values * kGenerateDataFLBALength); - prefixed_random_byte_array(num_values, 0, heap->data(), kGenerateDataFLBALength, out, - prefixed_probability); + prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(), + kGenerateDataFLBALength, out, prefixed_probability); } template <> From 578d7de93b07cb759a59c309925218300d9b239a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 7 Aug 2023 17:44:25 +0200 Subject: [PATCH 72/78] Switch to resizable buffer --- cpp/src/parquet/encoding.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index f518e861d2d..c2633a660bf 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3047,8 +3047,13 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
memory_pool())); + if (buffer_ == nullptr) { + PARQUET_ASSIGN_OR_THROW(buffer_, + ::arrow::AllocateResizableBuffer(num_values * sizeof(T), + this->memory_pool())); + } else { + PARQUET_THROW_NOT_OK(buffer_->Resize(num_values * sizeof(T), false)); + } T* data = reinterpret_cast(buffer_->mutable_data()); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, data); @@ -3153,7 +3158,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
suffix_encoder_; std::string last_value_; const ByteArray empty_; - std::unique_ptr buffer_; + std::unique_ptr buffer_; }; struct ByteArrayVisitor { From 816777142f1e6fd4f368a2c8c7655ba574abe3a7 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 17 Aug 2023 02:37:46 +0200 Subject: [PATCH 73/78] Apply suggestions from code review Co-authored-by: Antoine Pitrou --- cpp/src/parquet/test_util.cc | 7 +++++-- cpp/src/parquet/test_util.h | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/test_util.cc b/cpp/src/parquet/test_util.cc index a093801b892..56d26994d2c 100644 --- a/cpp/src/parquet/test_util.cc +++ b/cpp/src/parquet/test_util.cc @@ -148,8 +148,11 @@ void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* o bool do_prefix = dist_has_prefix(gen) && i > 0; std::uniform_int_distribution d4(min_size, len); - int prefix_len = - do_prefix ? static_cast(std::ceil(len * dist_prefix_length(gen))) : 0; + int prefix_len = 0; + if (do_prefix) { + int max_prefix_len = std::min(len, out[i - 1].len); + prefix_len = static_cast(std::ceil(max_prefix_len * dist_prefix_length(gen))); + } for (int j = 0; j < prefix_len; ++j) { buf[j] = out[i - 1].ptr[j]; } diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index 076865f51ac..57d746bd2a1 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -789,6 +789,9 @@ inline void GenerateData(int num_values, ByteArray* out, random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len); } +// Generate ByteArray or FLBA data where there is a given probability +// for each value to share a common prefix with its predecessor. +// This is useful to exercise prefix-based encodings such as DELTA_BYTE_ARRAY. template inline void GeneratePrefixedData(int num_values, T* out, std::vector* heap, double prefixed_probability); @@ -797,9 +800,9 @@ template <> inline void GeneratePrefixedData(int num_values, ByteArray* out, std::vector* heap, double prefixed_probability) { - // seed the prng so failure is deterministic int max_byte_array_len = 12; heap->resize(num_values * max_byte_array_len); + // seed the prng so failure is deterministic prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(), out, /*min_size=*/2, /*max_size=*/max_byte_array_len, prefixed_probability); } From 0aaa6b44900d98e79aec14d3b7203968a459592e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 17 Aug 2023 04:49:32 +0200 Subject: [PATCH 74/78] Review feedback --- cpp/src/parquet/encoding.cc | 15 ++++++--------- cpp/src/parquet/encoding_test.cc | 5 +++-- cpp/src/parquet/test_util.cc | 3 +-- cpp/src/parquet/test_util.h | 6 +++--- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index c2633a660bf..dd02b6be39a 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3065,17 +3065,15 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
- void PutInternal(const T* src, int num_values) { + void PutInternal(const T* src, int num_values, const VisitorType visitor) { if (num_values == 0) { return; } - uint32_t flba_len = descr_->type_length(); std::string_view last_value_view = last_value_; constexpr int kBatchSize = 256; std::array prefix_lengths; std::array suffixes; - auto visitor = VisitorType{src, flba_len}; for (int i = 0; i < num_values; i += kBatchSize) { const int batch_size = std::min(kBatchSize, num_values - i); @@ -3083,7 +3081,7 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
(view.length()); + const auto len = static_cast(view.length()); uint32_t common_prefix_length = 0; const uint32_t maximum_common_prefix_length = @@ -3121,7 +3119,6 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
= kMaxByteArraySize)) { return Status::Invalid("Parquet cannot store strings with size 2GB or more"); } - // Convert to ByteArray, so it can be passed to the suffix_encoder_. const ByteArray src{view}; uint32_t common_prefix_length = 0; @@ -3163,8 +3160,6 @@ class DeltaByteArrayEncoder : public EncoderImpl, virtual public TypedEncoder
= kMaxByteArraySize)) { @@ -3189,12 +3184,14 @@ struct FLBAVisitor { template <> void DeltaByteArrayEncoder::Put(const ByteArray* src, int num_values) { - PutInternal(src, num_values); + auto visitor = ByteArrayVisitor{src}; + PutInternal(src, num_values, visitor); } template <> void DeltaByteArrayEncoder::Put(const FLBA* src, int num_values) { - PutInternal(src, num_values); + auto visitor = FLBAVisitor{src, static_cast(descr_->type_length())}; + PutInternal(src, num_values, visitor); } template diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 52c116bd87a..ca5679c804f 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -36,6 +36,7 @@ #include "arrow/util/bitmap_writer.h" #include "arrow/util/checked_cast.h" #include "arrow/util/endian.h" +#include "arrow/util/span.h" #include "arrow/util/string.h" #include "parquet/encoding.h" #include "parquet/platform.h" @@ -181,7 +182,7 @@ class TestEncodingBase : public ::testing::Test { void TearDown() {} - void InitData(int nvalues, int repeats) { + virtual void InitData(int nvalues, int repeats) { num_values_ = nvalues * repeats; input_bytes_.resize(num_values_ * sizeof(c_type)); output_bytes_.resize(num_values_ * sizeof(c_type)); @@ -1988,7 +1989,7 @@ class TestDeltaByteArrayEncoding : public TestDeltaLengthByteArrayEncoding static constexpr int TYPE = Type::type_num; static constexpr double prefixed_probability = 0.5; - void InitData(int nvalues, int repeats) { + void InitData(int nvalues, int repeats) override { num_values_ = nvalues * repeats; input_bytes_.resize(num_values_ * sizeof(c_type)); output_bytes_.resize(num_values_ * sizeof(c_type)); diff --git a/cpp/src/parquet/test_util.cc b/cpp/src/parquet/test_util.cc index 56d26994d2c..b65945cc732 100644 --- a/cpp/src/parquet/test_util.cc +++ b/cpp/src/parquet/test_util.cc @@ -147,10 +147,9 @@ void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* o out[i].ptr = buf; bool do_prefix = dist_has_prefix(gen) && i > 0; - std::uniform_int_distribution d4(min_size, len); int prefix_len = 0; if (do_prefix) { - int max_prefix_len = std::min(len, out[i - 1].len); + int max_prefix_len = std::min(len, static_cast(out[i - 1].len)); prefix_len = static_cast(std::ceil(max_prefix_len * dist_prefix_length(gen))); } for (int j = 0; j < prefix_len; ++j) { diff --git a/cpp/src/parquet/test_util.h b/cpp/src/parquet/test_util.h index 57d746bd2a1..c8578609e9b 100644 --- a/cpp/src/parquet/test_util.h +++ b/cpp/src/parquet/test_util.h @@ -783,9 +783,9 @@ inline void GenerateData(int num_values, Int96* out, std::vector template <> inline void GenerateData(int num_values, ByteArray* out, std::vector* heap) { - // seed the prng so failure is deterministic int max_byte_array_len = 12; heap->resize(num_values * max_byte_array_len); + // seed the prng so failure is deterministic random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len); } @@ -813,16 +813,16 @@ template <> inline void GeneratePrefixedData(int num_values, FLBA* out, std::vector* heap, double prefixed_probability) { - // seed the prng so failure is deterministic heap->resize(num_values * kGenerateDataFLBALength); + // seed the prng so failure is deterministic prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(), kGenerateDataFLBALength, out, prefixed_probability); } template <> inline void GenerateData(int num_values, FLBA* out, std::vector* heap) { - // seed the prng so failure is deterministic heap->resize(num_values * kGenerateDataFLBALength); + // seed the prng so failure is deterministic random_fixed_byte_array(num_values, 0, heap->data(), kGenerateDataFLBALength, out); } From 9f0cdbb461474bf90780916f638251b3a92a2d9c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 17 Aug 2023 21:28:23 +0200 Subject: [PATCH 75/78] Review feedback --- cpp/src/parquet/encoding.cc | 104 +++++++++++++++++++------------ cpp/src/parquet/encoding.h | 6 +- cpp/src/parquet/encoding_test.cc | 40 ++++++++++-- 3 files changed, 101 insertions(+), 49 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index dd02b6be39a..7d02ce1ed0f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1184,9 +1184,8 @@ template struct ArrowBinaryHelper; template -struct ArrowBinaryHelper || - std::is_same_v, - void>> { +struct ArrowBinaryHelper, void>> { explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* acc) { builder = acc->builder.get(); chunks = &acc->chunks; @@ -1200,18 +1199,26 @@ struct ArrowBinaryHelper result; RETURN_NOT_OK(builder->Finish(&result)); - chunks->push_back(std::move(result)); + chunks->push_back(result); chunk_space_remaining = ::arrow::kBinaryMemoryLimit; return Status::OK(); } bool CanFit(int64_t length) const { return length <= chunk_space_remaining; } - void UnsafeAppend(const uint8_t* data, int32_t length); + void UnsafeAppend(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); + chunk_space_remaining -= length; + builder->UnsafeAppend(data, length); + } void UnsafeAppendNull() { builder->UnsafeAppendNull(); } - Status Append(const uint8_t* data, int32_t length); + Status Append(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); + chunk_space_remaining -= length; + return builder->Append(data, length); + } Status AppendNull() { return builder->AppendNull(); } @@ -1220,26 +1227,45 @@ struct ArrowBinaryHelper -Status ArrowBinaryHelper::Append(const uint8_t* data, int32_t length) { - DCHECK(CanFit(length)); - chunk_space_remaining -= length; - return builder->Append(data, length); -} +template +struct ArrowBinaryHelper, void>> { + explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* acc) { + builder = acc; + if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, + builder->value_data_length(), + &space_remaining))) { + throw ParquetException("excess expansion in ArrowBinaryHelper"); + } + } -template <> -Status ArrowBinaryHelper::Append(const uint8_t* data, int32_t length) { - DCHECK(CanFit(length)); - chunk_space_remaining -= length; - return builder->Append(data); -} + Status PushChunk() { + std::shared_ptr<::arrow::Array> result; + RETURN_NOT_OK(builder->Finish(&result)); + space_remaining = ::arrow::kBinaryMemoryLimit; + return Status::OK(); + } -template <> -void ArrowBinaryHelper::UnsafeAppend(const uint8_t* data, int32_t length) { - DCHECK(CanFit(length)); - chunk_space_remaining -= length; - builder->UnsafeAppend(data, length); -} + bool CanFit(int64_t length) const { return length <= space_remaining; } + + void UnsafeAppend(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); + space_remaining -= length; + builder->UnsafeAppend(data, length); + } + + void UnsafeAppendNull() { builder->UnsafeAppendNull(); } + + Status Append(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); + space_remaining -= length; + return builder->Append(data); + } + + Status AppendNull() { return builder->AppendNull(); } + + typename EncodingTraits::Accumulator* builder; + int64_t space_remaining; +}; template <> inline int PlainDecoder::DecodeArrow( @@ -1258,21 +1284,21 @@ inline int PlainDecoder::DecodeArrow( template <> inline int PlainDecoder::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* acc) { + typename EncodingTraits::Accumulator* builder) { int values_decoded = num_values - null_count; if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) { ParquetException::EofException(); } - PARQUET_THROW_NOT_OK(acc->builder->Reserve(num_values)); + PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - acc->builder->UnsafeAppend(data_); + builder->UnsafeAppend(data_); data_ += descr_->type_length(); }, - [&]() { acc->builder->UnsafeAppendNull(); }); + [&]() { builder->UnsafeAppendNull(); }); num_values_ -= values_decoded; len_ -= descr_->type_length() * values_decoded; @@ -1327,19 +1353,19 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* acc) override { + typename EncodingTraits::Accumulator* out) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, - valid_bits_offset, acc, &result)); + valid_bits_offset, out, &result)); return result; } private: Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* acc, + typename EncodingTraits::Accumulator* out, int* out_values_decoded) { - ArrowBinaryHelper helper(acc); + ArrowBinaryHelper helper(out); int values_decoded = 0; RETURN_NOT_OK(helper.builder->Reserve(num_values)); @@ -1722,14 +1748,14 @@ int DictDecoderImpl::DecodeArrow( template <> inline int DictDecoderImpl::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - typename EncodingTraits::Accumulator* acc) { - if (acc->builder->byte_width() != descr_->type_length()) { + typename EncodingTraits::Accumulator* builder) { + if (builder->byte_width() != descr_->type_length()) { throw ParquetException("Byte width mismatch: builder was " + - std::to_string(acc->builder->byte_width()) + - " but decoder was " + std::to_string(descr_->type_length())); + std::to_string(builder->byte_width()) + " but decoder was " + + std::to_string(descr_->type_length())); } - PARQUET_THROW_NOT_OK(acc->builder->Reserve(num_values)); + PARQUET_THROW_NOT_OK(builder->Reserve(num_values)); auto dict_values = reinterpret_cast(dictionary_->data()); @@ -1741,9 +1767,9 @@ inline int DictDecoderImpl::DecodeArrow( throw ParquetException(""); } PARQUET_THROW_NOT_OK(IndexInBounds(index)); - acc->builder->UnsafeAppend(dict_values[index].ptr); + builder->UnsafeAppend(dict_values[index].ptr); }, - [&]() { acc->builder->UnsafeAppendNull(); }); + [&]() { builder->UnsafeAppendNull(); }); return num_values - null_count; } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index ef06844cc8d..5dc5d534674 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -156,13 +156,9 @@ template <> struct EncodingTraits { using Encoder = FLBAEncoder; using Decoder = FLBADecoder; - using BuilderType = ::arrow::FixedSizeBinaryBuilder; - struct Accumulator { - std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder; - std::vector> chunks; - }; using ArrowType = ::arrow::FixedSizeBinaryType; + using Accumulator = ::arrow::FixedSizeBinaryBuilder; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>; }; diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index ca5679c804f..69b87f6469b 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -917,7 +917,7 @@ std::shared_ptr<::arrow::Array> EncodingAdHocTyped::GetValues(int seed } using EncodingAdHocTypedCases = - ::testing::Types; + ::testing::Types; TYPED_TEST_SUITE(EncodingAdHocTyped, EncodingAdHocTypedCases); @@ -2030,13 +2030,16 @@ TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { template class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { + using ArrowType = typename EncodingTraits::ArrowType; + using IsFixedSizeBinary = ::arrow::is_fixed_size_binary_type; + public: std::unique_ptr> encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); std::unique_ptr> decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); - void CheckDirectPut(std::shared_ptr<::arrow::Array> array) { + void CheckDirectPutByteArray(std::shared_ptr<::arrow::Array> array) { ASSERT_NO_THROW(encoder->Put(*array)); auto buf = encoder->FlushValues(); @@ -2060,6 +2063,36 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { ::arrow::AssertArraysEqual(*array, *result); } + void CheckDirectPutFLBA(std::shared_ptr<::arrow::Array> array) { + ASSERT_NO_THROW(encoder->Put(*array)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast(array->length() - array->null_count()); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + + auto acc = + typename EncodingTraits::Accumulator(array->type(), default_memory_pool()); + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast(array->length()), + static_cast(array->null_count()), + array->null_bitmap_data(), array->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.Finish(&result)); + ASSERT_EQ(array->length(), result->length()); + ASSERT_OK(result->ValidateFull()); + + ::arrow::AssertArraysEqual(*array, *result); + } + + void CheckDirectPut(std::shared_ptr<::arrow::Array> array) { + if constexpr (IsFixedSizeBinary::value) { + CheckDirectPutFLBA(array); + } else { + CheckDirectPutByteArray(array); + } + } + void CheckRoundtripFLBA() { constexpr int64_t kSize = 50; constexpr int kSeed = 42; @@ -2095,9 +2128,6 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { } void CheckRoundtrip() override { - using ArrowType = typename EncodingTraits::ArrowType; - using IsFixedSizeBinary = ::arrow::is_fixed_size_binary_type; - if constexpr (IsFixedSizeBinary::value) { CheckRoundtripFLBA(); } else { From 46aa3033bdfc1d143587ec43091c160d4dfdf24f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 17 Aug 2023 22:56:34 +0200 Subject: [PATCH 76/78] Review feedback --- cpp/src/parquet/encoding_test.cc | 68 +++++++++++++++++--------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 69b87f6469b..f1b1933193f 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1774,6 +1774,19 @@ std::shared_ptr DeltaEncode(std::vector lengths) { return encoder->FlushValues(); } +std::shared_ptr DeltaEncode(::arrow::util::span lengths) { + auto encoder = MakeTypedEncoder(Encoding::DELTA_BINARY_PACKED); + encoder->Put(lengths.data(), static_cast(lengths.size())); + return encoder->FlushValues(); +} + +std::shared_ptr DeltaEncode(std::shared_ptr<::arrow::Array>& lengths) { + auto data = ::arrow::internal::checked_pointer_cast(lengths); + auto span = ::arrow::util::span{data->raw_values(), + static_cast(lengths->length())}; + return DeltaEncode(span); +} + TEST(TestDeltaLengthByteArrayEncoding, AdHocRoundTrip) { const std::shared_ptr<::arrow::Array> cases[] = { ::arrow::ArrayFromJSON(::arrow::utf8(), R"([])"), @@ -1783,10 +1796,10 @@ TEST(TestDeltaLengthByteArrayEncoding, AdHocRoundTrip) { }; std::string expected_encoded_vals[] = { - DeltaEncode({})->ToString(), - DeltaEncode({3, 2, 0})->ToString() + "abcde", - DeltaEncode({0, 0, 0})->ToString(), - DeltaEncode({0, 3})->ToString() + "xyz", + DeltaEncode(std::vector({}))->ToString(), + DeltaEncode(std::vector({3, 2, 0}))->ToString() + "abcde", + DeltaEncode(std::vector({0, 0, 0}))->ToString(), + DeltaEncode(std::vector({0, 3}))->ToString() + "xyz", }; auto encoder = MakeTypedEncoder(Encoding::DELTA_LENGTH_BYTE_ARRAY, @@ -2154,15 +2167,6 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { ASSERT_TRUE(encoded->Equals(*buf)); }; - auto ArrayToInt32Vector = [](const std::shared_ptr<::arrow::Array>& lengths) { - std::vector vector; - auto data_ptr = checked_cast<::arrow::Int32Array*>(lengths.get()); - for (int i = 0; i < lengths->length(); ++i) { - vector.push_back(data_ptr->GetView(i)); - } - return vector; - }; - auto CheckDecode = [](std::shared_ptr buf, std::shared_ptr<::arrow::Array> values) { int num_values = static_cast(values->length()); @@ -2190,25 +2194,25 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { ::arrow::AssertArraysEqual(*values, *upcast_result); }; - auto CheckEncodeDecode = - [&](std::string_view values, std::shared_ptr<::arrow::Array> prefix_lengths, - std::shared_ptr<::arrow::Array> suffix_lengths, std::string_view suffix_data) { - auto encoded = - ::arrow::ConcatenateBuffers({DeltaEncode(ArrayToInt32Vector(prefix_lengths)), - DeltaEncode(ArrayToInt32Vector(suffix_lengths)), - std::make_shared(suffix_data)}) - .ValueOrDie(); - - CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), encoded); - CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), encoded); - CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), encoded); - CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), encoded); - - CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::utf8(), values)); - CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_utf8(), values)); - CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::binary(), values)); - CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); - }; + auto CheckEncodeDecode = [&](std::string_view values, + std::shared_ptr<::arrow::Array> prefix_lengths, + std::shared_ptr<::arrow::Array> suffix_lengths, + std::string_view suffix_data) { + auto encoded = ::arrow::ConcatenateBuffers({DeltaEncode(prefix_lengths), + DeltaEncode(suffix_lengths), + std::make_shared(suffix_data)}) + .ValueOrDie(); + + CheckEncode(::arrow::ArrayFromJSON(::arrow::utf8(), values), encoded); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_utf8(), values), encoded); + CheckEncode(::arrow::ArrayFromJSON(::arrow::binary(), values), encoded); + CheckEncode(::arrow::ArrayFromJSON(::arrow::large_binary(), values), encoded); + + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::utf8(), values)); + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_utf8(), values)); + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::binary(), values)); + CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), values)); + }; { auto values = R"(["axis", "axle", "babble", "babyhood"])"; From acc40eda183f807b4870f10f7ea3aef994a9b193 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 21 Aug 2023 12:55:25 +0200 Subject: [PATCH 77/78] Review feedback --- cpp/src/parquet/encoding.cc | 60 ++++++++----------------------------- cpp/src/parquet/encoding.h | 3 +- 2 files changed, 14 insertions(+), 49 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 7d02ce1ed0f..2fa12d8bb62 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1180,15 +1180,15 @@ int PlainBooleanDecoder::Decode(bool* buffer, int max_values) { return max_values; } -template -struct ArrowBinaryHelper; - template -struct ArrowBinaryHelper, void>> { +struct ArrowBinaryHelper { explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* acc) { - builder = acc->builder.get(); - chunks = &acc->chunks; + if constexpr (std::is_same_v) { + builder = acc->builder.get(); + chunks = &acc->chunks; + } else { + builder = acc; + } if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, builder->value_data_length(), &chunk_space_remaining))) { @@ -1217,7 +1217,11 @@ struct ArrowBinaryHelperAppend(data, length); + if constexpr (std::is_same_v) { + return builder->Append(data); + } else { + return builder->Append(data, length); + } } Status AppendNull() { return builder->AppendNull(); } @@ -1227,46 +1231,6 @@ struct ArrowBinaryHelper -struct ArrowBinaryHelper, void>> { - explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* acc) { - builder = acc; - if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, - builder->value_data_length(), - &space_remaining))) { - throw ParquetException("excess expansion in ArrowBinaryHelper"); - } - } - - Status PushChunk() { - std::shared_ptr<::arrow::Array> result; - RETURN_NOT_OK(builder->Finish(&result)); - space_remaining = ::arrow::kBinaryMemoryLimit; - return Status::OK(); - } - - bool CanFit(int64_t length) const { return length <= space_remaining; } - - void UnsafeAppend(const uint8_t* data, int32_t length) { - DCHECK(CanFit(length)); - space_remaining -= length; - builder->UnsafeAppend(data, length); - } - - void UnsafeAppendNull() { builder->UnsafeAppendNull(); } - - Status Append(const uint8_t* data, int32_t length) { - DCHECK(CanFit(length)); - space_remaining -= length; - return builder->Append(data); - } - - Status AppendNull() { return builder->AppendNull(); } - - typename EncodingTraits::Accumulator* builder; - int64_t space_remaining; -}; - template <> inline int PlainDecoder::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 5dc5d534674..300352bfe85 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -158,7 +158,8 @@ struct EncodingTraits { using Decoder = FLBADecoder; using ArrowType = ::arrow::FixedSizeBinaryType; - using Accumulator = ::arrow::FixedSizeBinaryBuilder; + using BuilderType = ::arrow::FixedSizeBinaryBuilder; + using Accumulator = BuilderType; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>; }; From 5039cf9ae7b15bc3fe8c7741f520c27c98076224 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 21 Aug 2023 17:37:35 +0200 Subject: [PATCH 78/78] Rewrite Binary/FLBA specializations to correctly account for Accumulator APIs --- cpp/src/parquet/encoding.cc | 183 ++++++++++++++++++++----------- cpp/src/parquet/encoding.h | 6 +- cpp/src/parquet/encoding_test.cc | 163 +++++++++++++-------------- 3 files changed, 202 insertions(+), 150 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 2fa12d8bb62..a3cef4b4ce8 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1180,55 +1180,126 @@ int PlainBooleanDecoder::Decode(bool* buffer, int max_values) { return max_values; } +// A helper class to abstract away differences between EncodingTraits::Accumulator +// for ByteArrayType and FLBAType. template -struct ArrowBinaryHelper { - explicit ArrowBinaryHelper(typename EncodingTraits::Accumulator* acc) { - if constexpr (std::is_same_v) { - builder = acc->builder.get(); - chunks = &acc->chunks; - } else { - builder = acc; +struct ArrowBinaryHelper; + +template <> +struct ArrowBinaryHelper { + using Accumulator = typename EncodingTraits::Accumulator; + + ArrowBinaryHelper(Accumulator* acc, int64_t length) + : acc_(acc), + entries_remaining_(length), + chunk_space_remaining_(::arrow::kBinaryMemoryLimit - + acc_->builder->value_data_length()) {} + + Status Prepare(std::optional estimated_data_length = {}) { + RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); + if (estimated_data_length.has_value()) { + RETURN_NOT_OK(acc_->builder->ReserveData( + std::min(*estimated_data_length, ::arrow::kBinaryMemoryLimit))); } - if (ARROW_PREDICT_FALSE(SubtractWithOverflow(::arrow::kBinaryMemoryLimit, - builder->value_data_length(), - &chunk_space_remaining))) { - throw ParquetException("excess expansion in ArrowBinaryHelper"); + return Status::OK(); + } + + Status PrepareNextInput(int64_t next_value_length, + std::optional estimated_remaining_data_length = {}) { + if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { + // This element would exceed the capacity of a chunk + RETURN_NOT_OK(PushChunk()); + RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); + if (estimated_remaining_data_length.has_value()) { + RETURN_NOT_OK(acc_->builder->ReserveData( + std::min(*estimated_remaining_data_length, chunk_space_remaining_))); + } } + return Status::OK(); } + void UnsafeAppend(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); + DCHECK_GT(entries_remaining_, 0); + chunk_space_remaining_ -= length; + --entries_remaining_; + acc_->builder->UnsafeAppend(data, length); + } + + Status Append(const uint8_t* data, int32_t length) { + DCHECK(CanFit(length)); + DCHECK_GT(entries_remaining_, 0); + chunk_space_remaining_ -= length; + --entries_remaining_; + return acc_->builder->Append(data, length); + } + + void UnsafeAppendNull() { + --entries_remaining_; + acc_->builder->UnsafeAppendNull(); + } + + Status AppendNull() { + --entries_remaining_; + return acc_->builder->AppendNull(); + } + + private: Status PushChunk() { - std::shared_ptr<::arrow::Array> result; - RETURN_NOT_OK(builder->Finish(&result)); - chunks->push_back(result); - chunk_space_remaining = ::arrow::kBinaryMemoryLimit; + ARROW_ASSIGN_OR_RAISE(auto chunk, acc_->builder->Finish()); + acc_->chunks.push_back(std::move(chunk)); + chunk_space_remaining_ = ::arrow::kBinaryMemoryLimit; return Status::OK(); } - bool CanFit(int64_t length) const { return length <= chunk_space_remaining; } + bool CanFit(int64_t length) const { return length <= chunk_space_remaining_; } - void UnsafeAppend(const uint8_t* data, int32_t length) { - DCHECK(CanFit(length)); - chunk_space_remaining -= length; - builder->UnsafeAppend(data, length); + Accumulator* acc_; + int64_t entries_remaining_; + int64_t chunk_space_remaining_; +}; + +template <> +struct ArrowBinaryHelper { + using Accumulator = typename EncodingTraits::Accumulator; + + ArrowBinaryHelper(Accumulator* acc, int64_t length) + : acc_(acc), entries_remaining_(length) {} + + Status Prepare(std::optional estimated_data_length = {}) { + return acc_->Reserve(entries_remaining_); } - void UnsafeAppendNull() { builder->UnsafeAppendNull(); } + Status PrepareNextInput(int64_t next_value_length, + std::optional estimated_remaining_data_length = {}) { + return Status::OK(); + } + + void UnsafeAppend(const uint8_t* data, int32_t length) { + DCHECK_GT(entries_remaining_, 0); + --entries_remaining_; + acc_->UnsafeAppend(data); + } Status Append(const uint8_t* data, int32_t length) { - DCHECK(CanFit(length)); - chunk_space_remaining -= length; - if constexpr (std::is_same_v) { - return builder->Append(data); - } else { - return builder->Append(data, length); - } + DCHECK_GT(entries_remaining_, 0); + --entries_remaining_; + return acc_->Append(data); } - Status AppendNull() { return builder->AppendNull(); } + void UnsafeAppendNull() { + --entries_remaining_; + acc_->UnsafeAppendNull(); + } + + Status AppendNull() { + --entries_remaining_; + return acc_->AppendNull(); + } - typename EncodingTraits::BuilderType* builder; - std::vector>* chunks; - int64_t chunk_space_remaining; + private: + Accumulator* acc_; + int64_t entries_remaining_; }; template <> @@ -1329,12 +1400,10 @@ class PlainByteArrayDecoder : public PlainDecoder, int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out, int* out_values_decoded) { - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(out, num_values); int values_decoded = 0; - RETURN_NOT_OK(helper.builder->Reserve(num_values)); - RETURN_NOT_OK(helper.builder->ReserveData( - std::min(len_, helper.chunk_space_remaining))); + RETURN_NOT_OK(helper.Prepare(len_)); int i = 0; RETURN_NOT_OK(VisitNullBitmapInline( @@ -1351,13 +1420,7 @@ class PlainByteArrayDecoder : public PlainDecoder, if (ARROW_PREDICT_FALSE(len_ < increment)) { ParquetException::EofException(); } - if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) { - // This element would exceed the capacity of a chunk - RETURN_NOT_OK(helper.PushChunk()); - RETURN_NOT_OK(helper.builder->Reserve(num_values - i)); - RETURN_NOT_OK(helper.builder->ReserveData( - std::min(len_, helper.chunk_space_remaining))); - } + RETURN_NOT_OK(helper.PrepareNextInput(value_len, len_)); helper.UnsafeAppend(data_ + 4, value_len); data_ += increment; len_ -= increment; @@ -1850,7 +1913,8 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, constexpr int32_t kBufferSize = 1024; int32_t indices[kBufferSize]; - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(out, num_values); + RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); int values_decoded = 0; @@ -1871,9 +1935,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, const auto index = indices[pos_indices++]; RETURN_NOT_OK(IndexInBounds(index)); const auto& val = dict_values[index]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); - } + RETURN_NOT_OK(helper.PrepareNextInput(val.len)); RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); ++values_decoded; return Status::OK(); @@ -1919,20 +1981,21 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int32_t indices[kBufferSize]; int values_decoded = 0; - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(out, num_values); + RETURN_NOT_OK(helper.Prepare(len_)); + auto dict_values = reinterpret_cast(dictionary_->data()); while (values_decoded < num_values) { - int32_t batch_size = std::min(kBufferSize, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices, batch_size); + const int32_t batch_size = + std::min(kBufferSize, num_values - values_decoded); + const int num_indices = idx_decoder_.GetBatch(indices, batch_size); if (num_indices == 0) ParquetException::EofException(); for (int i = 0; i < num_indices; ++i) { auto idx = indices[i]; RETURN_NOT_OK(IndexInBounds(idx)); const auto& val = dict_values[idx]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); - } + RETURN_NOT_OK(helper.PrepareNextInput(val.len)); RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); } values_decoded += num_indices; @@ -2762,7 +2825,8 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out, int* out_num_values) { - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(out, num_values); + RETURN_NOT_OK(helper.Prepare()); std::vector values(num_values - null_count); const int num_valid_values = Decode(values.data(), num_values - null_count); @@ -2778,9 +2842,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, valid_bits, valid_bits_offset, num_values, null_count, [&]() { const auto& val = values_ptr[value_idx]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); - } + RETURN_NOT_OK(helper.PrepareNextInput(val.len)); RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); ++value_idx; return Status::OK(); @@ -3334,7 +3396,8 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out, int* out_num_values) { - ArrowBinaryHelper helper(out); + ArrowBinaryHelper helper(out, num_values); + RETURN_NOT_OK(helper.Prepare()); std::vector values(num_values); const int num_valid_values = GetInternal(values.data(), num_values - null_count); @@ -3347,9 +3410,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode valid_bits, valid_bits_offset, num_values, null_count, [&]() { const auto& val = values_ptr[value_idx]; - if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { - RETURN_NOT_OK(helper.PushChunk()); - } + RETURN_NOT_OK(helper.PrepareNextInput(val.len)); RETURN_NOT_OK(helper.Append(val.ptr, static_cast(val.len))); ++value_idx; return Status::OK(); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 300352bfe85..6cdfe379202 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -140,15 +140,14 @@ template <> struct EncodingTraits { using Encoder = ByteArrayEncoder; using Decoder = ByteArrayDecoder; - using BuilderType = ::arrow::BinaryBuilder; + using ArrowType = ::arrow::BinaryType; /// \brief Internal helper class for decoding BYTE_ARRAY data where we can /// overflow the capacity of a single arrow::BinaryArray struct Accumulator { std::unique_ptr<::arrow::BinaryBuilder> builder; std::vector> chunks; }; - using ArrowType = ::arrow::BinaryType; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; }; @@ -158,8 +157,7 @@ struct EncodingTraits { using Decoder = FLBADecoder; using ArrowType = ::arrow::FixedSizeBinaryType; - using BuilderType = ::arrow::FixedSizeBinaryBuilder; - using Accumulator = BuilderType; + using Accumulator = ::arrow::FixedSizeBinaryBuilder; using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>; }; diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index f1b1933193f..71dc40d33ac 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2042,9 +2042,10 @@ TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) { } template -class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { +class TestDeltaByteArrayEncodingDirectPut : public TestEncodingBase { using ArrowType = typename EncodingTraits::ArrowType; - using IsFixedSizeBinary = ::arrow::is_fixed_size_binary_type; + using Accumulator = typename EncodingTraits::Accumulator; + using BuilderType = typename ::arrow::TypeTraits::BuilderType; public: std::unique_ptr> encoder = @@ -2052,109 +2053,101 @@ class DeltaByteArrayEncodingDirectPut : public TestEncodingBase { std::unique_ptr> decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); - void CheckDirectPutByteArray(std::shared_ptr<::arrow::Array> array) { - ASSERT_NO_THROW(encoder->Put(*array)); - auto buf = encoder->FlushValues(); + void CheckDirectPut(std::shared_ptr<::arrow::Array> array); - int num_values = static_cast(array->length() - array->null_count()); - decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + void CheckRoundtrip() override; - typename EncodingTraits::Accumulator acc; - using BuilderType = typename EncodingTraits::BuilderType; - acc.builder = std::make_unique(array->type(), default_memory_pool()); + protected: + USING_BASE_MEMBERS(); +}; - ASSERT_EQ(num_values, - decoder->DecodeArrow(static_cast(array->length()), - static_cast(array->null_count()), - array->null_bitmap_data(), array->offset(), &acc)); +template <> +void TestDeltaByteArrayEncodingDirectPut::CheckDirectPut( + std::shared_ptr<::arrow::Array> array) { + ASSERT_NO_THROW(encoder->Put(*array)); + auto buf = encoder->FlushValues(); - std::shared_ptr<::arrow::Array> result; - ASSERT_OK(acc.builder->Finish(&result)); - ASSERT_EQ(array->length(), result->length()); - ASSERT_OK(result->ValidateFull()); + int num_values = static_cast(array->length() - array->null_count()); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); - ::arrow::AssertArraysEqual(*array, *result); - } + Accumulator acc; + acc.builder = std::make_unique(array->type(), default_memory_pool()); - void CheckDirectPutFLBA(std::shared_ptr<::arrow::Array> array) { - ASSERT_NO_THROW(encoder->Put(*array)); - auto buf = encoder->FlushValues(); + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast(array->length()), + static_cast(array->null_count()), + array->null_bitmap_data(), array->offset(), &acc)); - int num_values = static_cast(array->length() - array->null_count()); - decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + ASSERT_EQ(acc.chunks.size(), 0) << "Accumulator shouldn't have overflowed chunks"; + ASSERT_OK_AND_ASSIGN(auto result, acc.builder->Finish()); + ASSERT_EQ(array->length(), result->length()); + ASSERT_OK(result->ValidateFull()); - auto acc = - typename EncodingTraits::Accumulator(array->type(), default_memory_pool()); - ASSERT_EQ(num_values, - decoder->DecodeArrow(static_cast(array->length()), - static_cast(array->null_count()), - array->null_bitmap_data(), array->offset(), &acc)); + ::arrow::AssertArraysEqual(*array, *result); +} - std::shared_ptr<::arrow::Array> result; - ASSERT_OK(acc.Finish(&result)); - ASSERT_EQ(array->length(), result->length()); - ASSERT_OK(result->ValidateFull()); +template <> +void TestDeltaByteArrayEncodingDirectPut::CheckDirectPut( + std::shared_ptr<::arrow::Array> array) { + ASSERT_NO_THROW(encoder->Put(*array)); + auto buf = encoder->FlushValues(); - ::arrow::AssertArraysEqual(*array, *result); - } + int num_values = static_cast(array->length() - array->null_count()); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); - void CheckDirectPut(std::shared_ptr<::arrow::Array> array) { - if constexpr (IsFixedSizeBinary::value) { - CheckDirectPutFLBA(array); - } else { - CheckDirectPutByteArray(array); - } - } + Accumulator acc(array->type(), default_memory_pool()); - void CheckRoundtripFLBA() { - constexpr int64_t kSize = 50; - constexpr int kSeed = 42; - constexpr int kByteWidth = 4; - ::arrow::random::RandomArrayGenerator rag{kSeed}; - std::shared_ptr<::arrow::Array> values = - rag.FixedSizeBinary(/*size=*/0, /*byte_width=*/kByteWidth); - CheckDirectPut(values); + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast(array->length()), + static_cast(array->null_count()), + array->null_bitmap_data(), array->offset(), &acc)); - for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { - values = rag.FixedSizeBinary(kSize + seed, kByteWidth); - CheckDirectPut(values); - } - } + ASSERT_OK_AND_ASSIGN(auto result, acc.Finish()); + ASSERT_EQ(array->length(), result->length()); + ASSERT_OK(result->ValidateFull()); - void CheckRoundtripByteArray() { - constexpr int64_t kSize = 500; - constexpr int32_t kMinLength = 0; - constexpr int32_t kMaxLength = 10; - constexpr int32_t kNumUnique = 10; - constexpr double kNullProbability = 0.25; - constexpr int kSeed = 42; - ::arrow::random::RandomArrayGenerator rag{kSeed}; - std::shared_ptr<::arrow::Array> values = rag.BinaryWithRepeats( - /*size=*/1, /*unique=*/1, kMinLength, kMaxLength, kNullProbability); - CheckDirectPut(values); + ::arrow::AssertArraysEqual(*array, *result); +} - for (int i = 0; i < 10; ++i) { - values = rag.BinaryWithRepeats(kSize, kNumUnique, kMinLength, kMaxLength, - kNullProbability); - CheckDirectPut(values); - } +template <> +void TestDeltaByteArrayEncodingDirectPut::CheckRoundtrip() { + constexpr int64_t kSize = 500; + constexpr int32_t kMinLength = 0; + constexpr int32_t kMaxLength = 10; + constexpr int32_t kNumUnique = 10; + constexpr double kNullProbability = 0.25; + constexpr int kSeed = 42; + ::arrow::random::RandomArrayGenerator rag{kSeed}; + std::shared_ptr<::arrow::Array> values = rag.BinaryWithRepeats( + /*size=*/1, /*unique=*/1, kMinLength, kMaxLength, kNullProbability); + CheckDirectPut(values); + + for (int i = 0; i < 10; ++i) { + values = rag.BinaryWithRepeats(kSize, kNumUnique, kMinLength, kMaxLength, + kNullProbability); + CheckDirectPut(values); } +} - void CheckRoundtrip() override { - if constexpr (IsFixedSizeBinary::value) { - CheckRoundtripFLBA(); - } else { - CheckRoundtripByteArray(); - } - } +template <> +void TestDeltaByteArrayEncodingDirectPut::CheckRoundtrip() { + constexpr int64_t kSize = 50; + constexpr int kSeed = 42; + constexpr int kByteWidth = 4; + ::arrow::random::RandomArrayGenerator rag{kSeed}; + std::shared_ptr<::arrow::Array> values = + rag.FixedSizeBinary(/*size=*/0, /*byte_width=*/kByteWidth); + CheckDirectPut(values); - protected: - USING_BASE_MEMBERS(); -}; + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + values = rag.FixedSizeBinary(kSize + seed, kByteWidth); + CheckDirectPut(values); + } +} -TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut, TestDeltaByteArrayEncodingTypes); +TYPED_TEST_SUITE(TestDeltaByteArrayEncodingDirectPut, TestDeltaByteArrayEncodingTypes); -TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) { +TYPED_TEST(TestDeltaByteArrayEncodingDirectPut, DirectPut) { ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip()); }