From c99d49d3d19e8de0486755da71de9cfd4717fa86 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 9 Sep 2023 17:02:08 +0800 Subject: [PATCH 1/5] add benchmark for delta --- cpp/src/parquet/encoding.cc | 28 +++++---- cpp/src/parquet/encoding_benchmark.cc | 84 +++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index e3c8ab196f4..f0038e6f79b 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3300,14 +3300,19 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode void SetData(int num_values, const uint8_t* data, int len) override { num_values_ = num_values; - decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + if (decoder_) { + decoder_->Reset(data, len); + } else { + decoder_ = std::make_shared<::arrow::bit_util::BitReader>(data, len); + } prefix_len_decoder_.SetDecoder(num_values, decoder_); // get the number of encoded prefix lengths int num_prefix = prefix_len_decoder_.ValidValuesCount(); // call prefix_len_decoder_.Decode to decode all the prefix lengths. // all the prefix lengths are buffered in buffered_prefix_length_. - PARQUET_THROW_NOT_OK(buffered_prefix_length_->Resize(num_prefix * sizeof(int32_t))); + PARQUET_THROW_NOT_OK(buffered_prefix_length_->Resize(num_prefix * sizeof(int32_t), + /*shrink_to_fit=*/false)); int ret = prefix_len_decoder_.Decode( reinterpret_cast(buffered_prefix_length_->mutable_data()), num_prefix); DCHECK_EQ(ret, num_prefix); @@ -3323,7 +3328,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode // TODO: read corrupted files written with bug(PARQUET-246). last_value_ should be set // to last_value_in_previous_page_ when decoding a new page(except the first page) - last_value_ = ""; + last_value_.clear(); } int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, @@ -3370,7 +3375,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode throw ParquetException("excess expansion in DELTA_BYTE_ARRAY"); } } - PARQUET_THROW_NOT_OK(buffered_data_->Resize(data_size)); + PARQUET_THROW_NOT_OK(buffered_data_->Resize(data_size, false)); string_view prefix{last_value_}; uint8_t* data_ptr = buffered_data_->mutable_data(); @@ -3378,12 +3383,15 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode if (ARROW_PREDICT_FALSE(static_cast(prefix_len_ptr[i]) > prefix.length())) { throw ParquetException("prefix length too large in DELTA_BYTE_ARRAY"); } - memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); - // buffer[i] currently points to the string suffix - memcpy(data_ptr + prefix_len_ptr[i], buffer[i].ptr, buffer[i].len); - buffer[i].ptr = data_ptr; - buffer[i].len += prefix_len_ptr[i]; - data_ptr += buffer[i].len; + // If the prefix length is zero, the prefix can be ignored. + if (prefix_len_ptr[i] != 0) { + memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); + // buffer[i] currently points to the string suffix + memcpy(data_ptr + prefix_len_ptr[i], buffer[i].ptr, buffer[i].len); + buffer[i].ptr = data_ptr; + buffer[i].len += prefix_len_ptr[i]; + data_ptr += buffer[i].len; + } prefix = std::string_view{buffer[i]}; } prefix_len_offset_ += max_values; diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 6726810911f..a643e3fb2bb 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -737,6 +737,90 @@ static void BM_DeltaLengthDecodingSpacedByteArray(benchmark::State& state) { BENCHMARK(BM_PlainDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments); BENCHMARK(BM_DeltaLengthDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments); +void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, + int min_size, int max_size, double prefixed_probability) { + std::default_random_engine gen(seed); + std::uniform_int_distribution dist_size(min_size, max_size); + std::uniform_int_distribution dist_byte(0, 255); + std::bernoulli_distribution dist_has_prefix(prefixed_probability); + std::uniform_real_distribution dist_prefix_length(0, 1); + + for (int i = 0; i < n; ++i) { + int len = dist_size(gen); + out[i].len = len; + out[i].ptr = buf; + + bool do_prefix = dist_has_prefix(gen) && i > 0; + int prefix_len = 0; + if (do_prefix) { + int max_prefix_len = std::min(len, static_cast(out[i - 1].len)); + prefix_len = static_cast(std::ceil(max_prefix_len * dist_prefix_length(gen))); + } + for (int j = 0; j < prefix_len; ++j) { + buf[j] = out[i - 1].ptr[j]; + } + for (int j = prefix_len; j < len; ++j) { + buf[j] = static_cast(dist_byte(gen)); + } + buf += len; + } +} + +static void BM_DeltaEncodingByteArray(benchmark::State& state) { + // Using arrow generator to generate random data. + int32_t max_length = static_cast(state.range(0)); + int32_t array_size = static_cast(state.range(1)); + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); + std::vector values; + std::vector buf(max_length * array_size); + values.resize(array_size); + prefixed_random_byte_array(array_size, /*seed=*/0, buf.data(), values.data(), + /*min_size=*/0, max_length, + /*prefixed_probability=*/0.5); + int64_t actual_length = 0; + for (auto v : values) { + actual_length += v.len; + } + + for (auto _ : state) { + encoder->Put(values.data(), static_cast(values.size())); + encoder->FlushValues(); + } + state.SetItemsProcessed(state.iterations() * array_size); + state.SetBytesProcessed(state.iterations() * actual_length); +} + +static void BM_DeltaDecodingByteArray(benchmark::State& state) { + // Using arrow generator to generate random data. + int32_t max_length = static_cast(state.range(0)); + int32_t array_size = static_cast(state.range(1)); + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); + std::vector values; + std::vector input_buf(max_length * array_size); + values.resize(array_size); + prefixed_random_byte_array(array_size, /*seed=*/0, input_buf.data(), values.data(), + /*min_size=*/0, max_length, + /*prefixed_probability=*/0.5); + int64_t actual_length = 0; + for (auto v : values) { + actual_length += v.len; + } + encoder->Put(values.data(), static_cast(values.size())); + std::shared_ptr buf = encoder->FlushValues(); + + auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); + for (auto _ : state) { + decoder->SetData(array_size, buf->data(), static_cast(buf->size())); + decoder->Decode(values.data(), static_cast(values.size())); + ::benchmark::DoNotOptimize(values); + } + state.SetItemsProcessed(state.iterations() * array_size); + state.SetBytesProcessed(state.iterations() * actual_length); +} + +BENCHMARK(BM_DeltaEncodingByteArray)->Apply(ByteArrayCustomArguments); +BENCHMARK(BM_DeltaDecodingByteArray)->Apply(ByteArrayCustomArguments); + static void BM_RleEncodingBoolean(benchmark::State& state) { std::vector values(state.range(0), true); auto encoder = MakeEncoder(Type::BOOLEAN, Encoding::RLE); From d9825d2ce56b234f6fbc93ea780bb9e59451d1da Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 15 Sep 2023 18:53:50 +0800 Subject: [PATCH 2/5] fix comment --- cpp/src/parquet/encoding.cc | 3 ++- cpp/src/parquet/encoding_benchmark.cc | 15 +++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index f0038e6f79b..6f2f9658233 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3375,7 +3375,8 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode throw ParquetException("excess expansion in DELTA_BYTE_ARRAY"); } } - PARQUET_THROW_NOT_OK(buffered_data_->Resize(data_size, false)); + // TODO(mwish): Release the buffer if it is too large. + PARQUET_THROW_NOT_OK(buffered_data_->Resize(data_size, /*shrink_to_fit=*/false)); string_view prefix{last_value_}; uint8_t* data_ptr = buffered_data_->mutable_data(); diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index a643e3fb2bb..306cd432132 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -770,13 +770,14 @@ static void BM_DeltaEncodingByteArray(benchmark::State& state) { // Using arrow generator to generate random data. int32_t max_length = static_cast(state.range(0)); int32_t array_size = static_cast(state.range(1)); + double prefixed_probability = state.range(2) / 100; auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); std::vector values; std::vector buf(max_length * array_size); values.resize(array_size); prefixed_random_byte_array(array_size, /*seed=*/0, buf.data(), values.data(), /*min_size=*/0, max_length, - /*prefixed_probability=*/0.5); + /*prefixed_probability=*/prefixed_probability); int64_t actual_length = 0; for (auto v : values) { actual_length += v.len; @@ -794,13 +795,14 @@ static void BM_DeltaDecodingByteArray(benchmark::State& state) { // Using arrow generator to generate random data. int32_t max_length = static_cast(state.range(0)); int32_t array_size = static_cast(state.range(1)); + double prefixed_probability = state.range(2) / 100; auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); std::vector values; std::vector input_buf(max_length * array_size); values.resize(array_size); prefixed_random_byte_array(array_size, /*seed=*/0, input_buf.data(), values.data(), /*min_size=*/0, max_length, - /*prefixed_probability=*/0.5); + /*prefixed_probability=*/prefixed_probability); int64_t actual_length = 0; for (auto v : values) { actual_length += v.len; @@ -818,8 +820,13 @@ static void BM_DeltaDecodingByteArray(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * actual_length); } -BENCHMARK(BM_DeltaEncodingByteArray)->Apply(ByteArrayCustomArguments); -BENCHMARK(BM_DeltaDecodingByteArray)->Apply(ByteArrayCustomArguments); +static void ByteArrayDeltaCustomArguments(benchmark::internal::Benchmark* b) { + b->ArgsProduct({{8, 64, 1024}, {512, 2048}, {10, 90, 99}}) + ->ArgNames({"max-string-length", "batch-size", "prefixed-probability"}); +} + +BENCHMARK(BM_DeltaEncodingByteArray)->Apply(ByteArrayDeltaCustomArguments); +BENCHMARK(BM_DeltaDecodingByteArray)->Apply(ByteArrayDeltaCustomArguments); static void BM_RleEncodingBoolean(benchmark::State& state) { std::vector values(state.range(0), true); From c567ed6e0adc6a22b87ffdcb24b6e37a05507726 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 15 Sep 2023 19:45:22 +0800 Subject: [PATCH 3/5] refactor test --- cpp/src/parquet/encoding_benchmark.cc | 34 +++++++++++++++++---------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 306cd432132..fab95516488 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -767,16 +767,16 @@ void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* o } static void BM_DeltaEncodingByteArray(benchmark::State& state) { - // Using arrow generator to generate random data. - int32_t max_length = static_cast(state.range(0)); - int32_t array_size = static_cast(state.range(1)); - double prefixed_probability = state.range(2) / 100; + int32_t min_length = static_cast(state.range(0)); + int32_t max_length = static_cast(state.range(1)); + int32_t array_size = static_cast(state.range(2)); + double prefixed_probability = state.range(3) / 100; auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); std::vector values; std::vector buf(max_length * array_size); values.resize(array_size); prefixed_random_byte_array(array_size, /*seed=*/0, buf.data(), values.data(), - /*min_size=*/0, max_length, + min_length, max_length, /*prefixed_probability=*/prefixed_probability); int64_t actual_length = 0; for (auto v : values) { @@ -792,16 +792,16 @@ static void BM_DeltaEncodingByteArray(benchmark::State& state) { } static void BM_DeltaDecodingByteArray(benchmark::State& state) { - // Using arrow generator to generate random data. - int32_t max_length = static_cast(state.range(0)); - int32_t array_size = static_cast(state.range(1)); - double prefixed_probability = state.range(2) / 100; + int32_t min_length = static_cast(state.range(0)); + int32_t max_length = static_cast(state.range(1)); + int32_t array_size = static_cast(state.range(2)); + double prefixed_probability = state.range(3) / 100; auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); std::vector values; std::vector input_buf(max_length * array_size); values.resize(array_size); prefixed_random_byte_array(array_size, /*seed=*/0, input_buf.data(), values.data(), - /*min_size=*/0, max_length, + min_length, max_length, /*prefixed_probability=*/prefixed_probability); int64_t actual_length = 0; for (auto v : values) { @@ -821,8 +821,18 @@ static void BM_DeltaDecodingByteArray(benchmark::State& state) { } static void ByteArrayDeltaCustomArguments(benchmark::internal::Benchmark* b) { - b->ArgsProduct({{8, 64, 1024}, {512, 2048}, {10, 90, 99}}) - ->ArgNames({"max-string-length", "batch-size", "prefixed-probability"}); + for (int max_string_length : {8, 64, 1024}) { + for (int batch_size : {512, 2048}) { + std::vector> prefix_gen_params = { + {10, 0}, {90, max_string_length / 2}, {99, max_string_length}}; + for (auto& [prefixed_probability, min_prefix_string_length] : prefix_gen_params) { + b->Args({min_prefix_string_length, max_string_length, batch_size, + prefixed_probability}); + } + } + } + b->ArgNames({"min-prefix-string-length", "max-string-length", "batch-size", + "prefixed-probability"}); } BENCHMARK(BM_DeltaEncodingByteArray)->Apply(ByteArrayDeltaCustomArguments); From e8e8766ea52825b38476fa351d7f8a8e412f54b1 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 26 Sep 2023 21:14:38 +0800 Subject: [PATCH 4/5] resolve comment and revert optimize --- cpp/src/parquet/encoding.cc | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 6f2f9658233..0564ea2b93f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3311,8 +3311,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode int num_prefix = prefix_len_decoder_.ValidValuesCount(); // call prefix_len_decoder_.Decode to decode all the prefix lengths. // all the prefix lengths are buffered in buffered_prefix_length_. - PARQUET_THROW_NOT_OK(buffered_prefix_length_->Resize(num_prefix * sizeof(int32_t), - /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK(buffered_prefix_length_->Resize(num_prefix * sizeof(int32_t))); int ret = prefix_len_decoder_.Decode( reinterpret_cast(buffered_prefix_length_->mutable_data()), num_prefix); DCHECK_EQ(ret, num_prefix); @@ -3375,8 +3374,7 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode throw ParquetException("excess expansion in DELTA_BYTE_ARRAY"); } } - // TODO(mwish): Release the buffer if it is too large. - PARQUET_THROW_NOT_OK(buffered_data_->Resize(data_size, /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK(buffered_data_->Resize(data_size)); string_view prefix{last_value_}; uint8_t* data_ptr = buffered_data_->mutable_data(); @@ -3384,15 +3382,12 @@ class DeltaByteArrayDecoderImpl : public DecoderImpl, virtual public TypedDecode if (ARROW_PREDICT_FALSE(static_cast(prefix_len_ptr[i]) > prefix.length())) { throw ParquetException("prefix length too large in DELTA_BYTE_ARRAY"); } - // If the prefix length is zero, the prefix can be ignored. - if (prefix_len_ptr[i] != 0) { - memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); - // buffer[i] currently points to the string suffix - memcpy(data_ptr + prefix_len_ptr[i], buffer[i].ptr, buffer[i].len); - buffer[i].ptr = data_ptr; - buffer[i].len += prefix_len_ptr[i]; - data_ptr += buffer[i].len; - } + memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); + // buffer[i] currently points to the string suffix + memcpy(data_ptr + prefix_len_ptr[i], buffer[i].ptr, buffer[i].len); + buffer[i].ptr = data_ptr; + buffer[i].len += prefix_len_ptr[i]; + data_ptr += buffer[i].len; prefix = std::string_view{buffer[i]}; } prefix_len_offset_ += max_values; From 557d0caa213a5e07057ed8422d0f768da35731d8 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 26 Sep 2023 15:20:08 +0200 Subject: [PATCH 5/5] Simplify and improve benchmarks --- cpp/src/parquet/encoding_benchmark.cc | 141 ++++++++++++++------------ 1 file changed, 74 insertions(+), 67 deletions(-) diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index fab95516488..717c7163305 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -737,102 +737,109 @@ static void BM_DeltaLengthDecodingSpacedByteArray(benchmark::State& state) { BENCHMARK(BM_PlainDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments); BENCHMARK(BM_DeltaLengthDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments); -void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, - int min_size, int max_size, double prefixed_probability) { - std::default_random_engine gen(seed); - std::uniform_int_distribution dist_size(min_size, max_size); - std::uniform_int_distribution dist_byte(0, 255); - std::bernoulli_distribution dist_has_prefix(prefixed_probability); - std::uniform_real_distribution dist_prefix_length(0, 1); - - for (int i = 0; i < n; ++i) { - int len = dist_size(gen); - out[i].len = len; - out[i].ptr = buf; - - bool do_prefix = dist_has_prefix(gen) && i > 0; - int prefix_len = 0; - if (do_prefix) { - int max_prefix_len = std::min(len, static_cast(out[i - 1].len)); - prefix_len = static_cast(std::ceil(max_prefix_len * dist_prefix_length(gen))); - } - for (int j = 0; j < prefix_len; ++j) { - buf[j] = out[i - 1].ptr[j]; - } - for (int j = prefix_len; j < len; ++j) { - buf[j] = static_cast(dist_byte(gen)); +struct DeltaByteArrayState { + int32_t min_size = 0; + int32_t max_size; + int32_t array_length; + int32_t total_data_size = 0; + double prefixed_probability; + std::vector buf; + + explicit DeltaByteArrayState(const benchmark::State& state) + : max_size(static_cast(state.range(0))), + array_length(static_cast(state.range(1))), + prefixed_probability(state.range(2) / 100.0) {} + + std::vector MakeRandomByteArray(uint32_t seed) { + std::default_random_engine gen(seed); + std::uniform_int_distribution dist_size(min_size, max_size); + std::uniform_int_distribution dist_byte(0, 255); + std::bernoulli_distribution dist_has_prefix(prefixed_probability); + std::uniform_real_distribution dist_prefix_length(0, 1); + + std::vector out(array_length); + buf.resize(max_size * array_length); + auto buf_ptr = buf.data(); + total_data_size = 0; + + for (int32_t i = 0; i < array_length; ++i) { + int len = dist_size(gen); + out[i].len = len; + out[i].ptr = buf_ptr; + + bool do_prefix = i > 0 && dist_has_prefix(gen); + int prefix_len = 0; + if (do_prefix) { + int max_prefix_len = std::min(len, static_cast(out[i - 1].len)); + prefix_len = + static_cast(std::ceil(max_prefix_len * dist_prefix_length(gen))); + } + for (int j = 0; j < prefix_len; ++j) { + buf_ptr[j] = out[i - 1].ptr[j]; + } + for (int j = prefix_len; j < len; ++j) { + buf_ptr[j] = static_cast(dist_byte(gen)); + } + buf_ptr += len; + total_data_size += len; } - buf += len; + return out; } -} +}; static void BM_DeltaEncodingByteArray(benchmark::State& state) { - int32_t min_length = static_cast(state.range(0)); - int32_t max_length = static_cast(state.range(1)); - int32_t array_size = static_cast(state.range(2)); - double prefixed_probability = state.range(3) / 100; + DeltaByteArrayState delta_state(state); + std::vector values = delta_state.MakeRandomByteArray(/*seed=*/42); + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); - std::vector values; - std::vector buf(max_length * array_size); - values.resize(array_size); - prefixed_random_byte_array(array_size, /*seed=*/0, buf.data(), values.data(), - min_length, max_length, - /*prefixed_probability=*/prefixed_probability); - int64_t actual_length = 0; - for (auto v : values) { - actual_length += v.len; - } + const int64_t plain_encoded_size = + delta_state.total_data_size + 4 * delta_state.array_length; + int64_t encoded_size = 0; for (auto _ : state) { encoder->Put(values.data(), static_cast(values.size())); - encoder->FlushValues(); + encoded_size = encoder->FlushValues()->size(); } - state.SetItemsProcessed(state.iterations() * array_size); - state.SetBytesProcessed(state.iterations() * actual_length); + state.SetItemsProcessed(state.iterations() * delta_state.array_length); + state.SetBytesProcessed(state.iterations() * delta_state.total_data_size); + state.counters["compression_ratio"] = + static_cast(plain_encoded_size) / encoded_size; } static void BM_DeltaDecodingByteArray(benchmark::State& state) { - int32_t min_length = static_cast(state.range(0)); - int32_t max_length = static_cast(state.range(1)); - int32_t array_size = static_cast(state.range(2)); - double prefixed_probability = state.range(3) / 100; + DeltaByteArrayState delta_state(state); + std::vector values = delta_state.MakeRandomByteArray(/*seed=*/42); + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); - std::vector values; - std::vector input_buf(max_length * array_size); - values.resize(array_size); - prefixed_random_byte_array(array_size, /*seed=*/0, input_buf.data(), values.data(), - min_length, max_length, - /*prefixed_probability=*/prefixed_probability); - int64_t actual_length = 0; - for (auto v : values) { - actual_length += v.len; - } encoder->Put(values.data(), static_cast(values.size())); std::shared_ptr buf = encoder->FlushValues(); + const int64_t plain_encoded_size = + delta_state.total_data_size + 4 * delta_state.array_length; + const int64_t encoded_size = buf->size(); + auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); for (auto _ : state) { - decoder->SetData(array_size, buf->data(), static_cast(buf->size())); + decoder->SetData(delta_state.array_length, buf->data(), + static_cast(buf->size())); decoder->Decode(values.data(), static_cast(values.size())); ::benchmark::DoNotOptimize(values); } - state.SetItemsProcessed(state.iterations() * array_size); - state.SetBytesProcessed(state.iterations() * actual_length); + state.SetItemsProcessed(state.iterations() * delta_state.array_length); + state.SetBytesProcessed(state.iterations() * delta_state.total_data_size); + state.counters["compression_ratio"] = + static_cast(plain_encoded_size) / encoded_size; } static void ByteArrayDeltaCustomArguments(benchmark::internal::Benchmark* b) { for (int max_string_length : {8, 64, 1024}) { for (int batch_size : {512, 2048}) { - std::vector> prefix_gen_params = { - {10, 0}, {90, max_string_length / 2}, {99, max_string_length}}; - for (auto& [prefixed_probability, min_prefix_string_length] : prefix_gen_params) { - b->Args({min_prefix_string_length, max_string_length, batch_size, - prefixed_probability}); + for (int prefixed_percent : {10, 90, 99}) { + b->Args({max_string_length, batch_size, prefixed_percent}); } } } - b->ArgNames({"min-prefix-string-length", "max-string-length", "batch-size", - "prefixed-probability"}); + b->ArgNames({"max-string-length", "batch-size", "prefixed-percent"}); } BENCHMARK(BM_DeltaEncodingByteArray)->Apply(ByteArrayDeltaCustomArguments);