From 5df5edecf956c40079871620d3a31fd71f8785d0 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Tue, 27 Dec 2022 13:43:25 -0800
Subject: [PATCH 1/4] feat(bench): add benchmark for reading strings from Parquet.

---
 .../parquet/arrow/reader_writer_benchmark.cc  | 67 +++++++++++++++----
 1 file changed, 55 insertions(+), 12 deletions(-)

diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
index 6445bb02758..a9c169f8344 100644
--- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc
+++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
@@ -88,6 +88,11 @@ struct benchmark_traits<BooleanType> {
   using arrow_type = ::arrow::BooleanType;
 };

+template <>
+struct benchmark_traits<ByteArrayType> {
+  using arrow_type = ::arrow::BinaryType;
+};
+
 template <typename ParquetType>
 using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;

@@ -208,7 +213,7 @@ struct Examples {
 };

 static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
-                               int64_t num_values = -1, int64_t bytes_per_value = -1) {
+                               int64_t num_values = -1, int64_t total_bytes = -1) {
   auto output = CreateOutputStream();
   EXIT_NOT_OK(
       WriteTable(table, ::arrow::default_memory_pool(), output, table.num_rows()));
@@ -228,20 +233,20 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table&
     num_values = table.num_rows();
   }
   state.SetItemsProcessed(num_values * state.iterations());
-  if (bytes_per_value != -1) {
-    state.SetBytesProcessed(num_values * state.iterations() * bytes_per_value);
+  if (total_bytes != -1) {
+    state.SetBytesProcessed(total_bytes * state.iterations());
   }
 }

 static void BenchmarkReadArray(::benchmark::State& state,
                                const std::shared_ptr<Array>& array, bool nullable,
-                               int64_t num_values = -1, int64_t bytes_per_value = -1) {
+                               int64_t num_values = -1, int64_t total_bytes = -1) {
   auto schema = ::arrow::schema({field("s", array->type(), nullable)});
   auto table = ::arrow::Table::Make(schema, {array}, array->length());

   EXIT_NOT_OK(table->Validate());

-  BenchmarkReadTable(state, *table, num_values, bytes_per_value);
+  BenchmarkReadTable(state, *table, num_values, total_bytes);
 }

 //
@@ -259,7 +264,7 @@ static void BM_ReadColumn(::benchmark::State& state) {
       TableFromVector<ParquetType>(values, nullable, state.range(0));

   BenchmarkReadTable(state, *table, table->num_rows(),
-                     sizeof(typename ParquetType::c_type));
+                     sizeof(typename ParquetType::c_type) * table->num_rows());
 }

 // There are two parameters here that cover different data distributions.
@@ -319,6 +324,44 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
     ->Args({kAlternatingOrNa, 1})
     ->Args({5, 10});

+//
+// Benchmark reading binary column
+//
+
+int32_t kInfiniteUniqueValues = -1;
+template <bool nullable>
+static void BM_ReadBinaryColumn(::benchmark::State& state) {
+  std::shared_ptr<::arrow::DataType> type = ::arrow::utf8();
+  std::shared_ptr<::arrow::Array> arr;
+  ::arrow::random::RandomArrayGenerator generator(500);
+  double null_percentage = static_cast<double>(state.range(0)) / 100.0;
+  if (state.range(1) == kInfiniteUniqueValues) {
+    arr = generator.String(BENCHMARK_SIZE, /*min_length=*/3, /*max_length=*/32,
+                           /*null_probability=*/null_percentage);
+  } else {
+    arr = generator.StringWithRepeats(BENCHMARK_SIZE, /*unique=*/state.range(1),
+                                      /*min_length=*/3, /*max_length=*/32,
+                                      /*null_probability=*/null_percentage);
+  }
+
+  std::shared_ptr<::arrow::Table> table = ::arrow::Table::Make(
+      ::arrow::schema({::arrow::field("column", type, nullable)}), {arr});
+
+  BenchmarkReadTable(state, *table, table->num_rows(), arr->data()->buffers[1]->size());
+}
+
+BENCHMARK_TEMPLATE1(BM_ReadBinaryColumn, false)
+    ->Args({/*null_probability*/ 0, /*unique_values*/ 2})
+    ->Args({/*null_probability*/ 0, /*unique_values*/ 32})
+    ->Args({/*null_probability*/ 0, /*unique_values*/ kInfiniteUniqueValues});
+BENCHMARK_TEMPLATE1(BM_ReadBinaryColumn, true)
+    ->Args({/*null_probability*/ 1, /*unique_values*/ 32})
+    ->Args({/*null_probability*/ 50, /*unique_values*/ 32})
+    ->Args({/*null_probability*/ 99, /*unique_values*/ 32})
+    ->Args({/*null_probability*/ 1, /*unique_values*/ kInfiniteUniqueValues})
+    ->Args({/*null_probability*/ 50, /*unique_values*/ kInfiniteUniqueValues})
+    ->Args({/*null_probability*/ 99, /*unique_values*/ kInfiniteUniqueValues});
+
 //
 // Benchmark reading a nested column
 //
@@ -383,7 +426,7 @@ static void BM_ReadStructColumn(::benchmark::State& state) {
   ::arrow::random::RandomArrayGenerator rng(42);
   auto array = MakeStructArray(&rng, kNumValues, null_probability);

-  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
 }

 BENCHMARK(BM_ReadStructColumn)->Apply(NestedReadArguments);
@@ -402,7 +445,7 @@ static void BM_ReadStructOfStructColumn(::benchmark::State& state) {
   auto values2 = MakeStructArray(&rng, kNumValues, null_probability);
   auto array = MakeStructArray(&rng, {values1, values2}, null_probability);

-  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
 }

 BENCHMARK(BM_ReadStructOfStructColumn)->Apply(NestedReadArguments);
@@ -426,7 +469,7 @@ static void BM_ReadStructOfListColumn(::benchmark::State& state) {
   auto array = MakeStructArray(&rng, {list1, list2}, null_probability,
                                /*propagate_validity =*/true);

-  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
 }

 BENCHMARK(BM_ReadStructOfListColumn)->Apply(NestedReadArguments);
@@ -445,7 +488,7 @@ static void BM_ReadListColumn(::benchmark::State& state) {

   auto array = rng.List(*values, kNumValues / 10, null_probability);

-  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
 }

 BENCHMARK(BM_ReadListColumn)->Apply(NestedReadArguments);
@@ -464,7 +507,7 @@ static void BM_ReadListOfStructColumn(::benchmark::State& state) {

   auto array = rng.List(*values, kNumValues / 10, null_probability);

-  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
 }

 BENCHMARK(BM_ReadListOfStructColumn)->Apply(NestedReadArguments);
@@ -484,7 +527,7 @@ static void BM_ReadListOfListColumn(::benchmark::State& state) {
   auto inner = rng.List(*values, kNumValues / 10, null_probability);
   auto array = rng.List(*inner, kNumValues / 100, null_probability);

-  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
+  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
 }

 BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments);

From 07ec9432a09c6c9870ff201b1c74858da1b16097 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Tue, 27 Dec 2022 16:37:00 -0800
Subject: [PATCH 2/4] feat(bench): add write benchmark

---
 .../parquet/arrow/reader_writer_benchmark.cc  | 69 ++++++++++++++-----
 1 file changed, 52 insertions(+), 17 deletions(-)

diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
index a9c169f8344..c1bb5f1ebda 100644
--- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc
+++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
@@ -202,6 +202,54 @@ BENCHMARK_TEMPLATE2(BM_WriteColumn, true, DoubleType);
 BENCHMARK_TEMPLATE2(BM_WriteColumn, false, BooleanType);
 BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType);

+int32_t kInfiniteUniqueValues = -1;
+
+std::shared_ptr<::arrow::Table> RandomStringTable(int64_t length, int64_t unique_values,
+                                                  int64_t null_percentage) {
+  std::shared_ptr<::arrow::DataType> type = ::arrow::utf8();
+  std::shared_ptr<::arrow::Array> arr;
+  ::arrow::random::RandomArrayGenerator generator(500);
+  double null_probability = static_cast<double>(null_percentage) / 100.0;
+  if (unique_values == kInfiniteUniqueValues) {
+    arr = generator.String(length, /*min_length=*/3, /*max_length=*/32,
+                           /*null_probability=*/null_probability);
+  } else {
+    arr = generator.StringWithRepeats(length, /*unique=*/unique_values,
+                                      /*min_length=*/3, /*max_length=*/32,
+                                      /*null_probability=*/null_probability);
+  }
+  return ::arrow::Table::Make(
+      ::arrow::schema({::arrow::field("column", type, null_percentage > 0)}), {arr});
+}
+
+template <bool nullable>
+static void BM_WriteBinaryColumn(::benchmark::State& state) {
+  std::shared_ptr<::arrow::Table> table =
+      RandomStringTable(BENCHMARK_SIZE, state.range(1), state.range(0));
+
+  while (state.KeepRunning()) {
+    auto output = CreateOutputStream();
+    EXIT_NOT_OK(
+        WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
+  }
+
+  int64_t total_bytes = table->column(0)->chunk(0)->data()->buffers[1]->size();
+  state.SetItemsProcessed(BENCHMARK_SIZE * state.iterations());
+  state.SetBytesProcessed(total_bytes * state.iterations());
+}
+
+BENCHMARK_TEMPLATE1(BM_WriteBinaryColumn, false)
+    ->Args({/*null_probability*/ 0, /*unique_values*/ 2})
+    ->Args({/*null_probability*/ 0, /*unique_values*/ 32})
+    ->Args({/*null_probability*/ 0, /*unique_values*/ kInfiniteUniqueValues});
+BENCHMARK_TEMPLATE1(BM_WriteBinaryColumn, true)
+    ->Args({/*null_probability*/ 1, /*unique_values*/ 32})
+    ->Args({/*null_probability*/ 50, /*unique_values*/ 32})
+    ->Args({/*null_probability*/ 99, /*unique_values*/ 32})
+    ->Args({/*null_probability*/ 1, /*unique_values*/ kInfiniteUniqueValues})
+    ->Args({/*null_probability*/ 50, /*unique_values*/ kInfiniteUniqueValues})
+    ->Args({/*null_probability*/ 99, /*unique_values*/ kInfiniteUniqueValues});
+
 template
 struct Examples {
   static constexpr std::array values() { return {127, 128}; }
@@ -328,26 +376,13 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
 // Benchmark reading binary column
 //

-int32_t kInfiniteUniqueValues = -1;
 template <bool nullable>
 static void BM_ReadBinaryColumn(::benchmark::State& state) {
-  std::shared_ptr<::arrow::DataType> type = ::arrow::utf8();
-  std::shared_ptr<::arrow::Array> arr;
-  ::arrow::random::RandomArrayGenerator generator(500);
-  double null_percentage = static_cast<double>(state.range(0)) / 100.0;
-  if (state.range(1) == kInfiniteUniqueValues) {
-    arr = generator.String(BENCHMARK_SIZE, /*min_length=*/3, /*max_length=*/32,
-                           /*null_probability=*/null_percentage);
-  } else {
-    arr = generator.StringWithRepeats(BENCHMARK_SIZE, /*unique=*/state.range(1),
-                                      /*min_length=*/3, /*max_length=*/32,
-                                      /*null_probability=*/null_percentage);
-  }
-
-  std::shared_ptr<::arrow::Table> table = ::arrow::Table::Make(
-      ::arrow::schema({::arrow::field("column", type, nullable)}), {arr});
+  std::shared_ptr<::arrow::Table> table =
+      RandomStringTable(BENCHMARK_SIZE, state.range(1), state.range(0));

-  BenchmarkReadTable(state, *table, table->num_rows(), arr->data()->buffers[1]->size());
+  int64_t total_bytes = table->column(0)->chunk(0)->data()->buffers[1]->size();
+  BenchmarkReadTable(state, *table, table->num_rows(), total_bytes);
 }

 BENCHMARK_TEMPLATE1(BM_ReadBinaryColumn, false)

From 3c02495fae26d5911e08d43224c062f3457a615c Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Tue, 27 Dec 2022 16:40:21 -0800
Subject: [PATCH 3/4] cleanup

---
 .../parquet/arrow/reader_writer_benchmark.cc  | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
index c1bb5f1ebda..fcc76d5fe2e 100644
--- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc
+++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
@@ -88,11 +88,6 @@ struct benchmark_traits<BooleanType> {
   using arrow_type = ::arrow::BooleanType;
 };

-template <>
-struct benchmark_traits<ByteArrayType> {
-  using arrow_type = ::arrow::BinaryType;
-};
-
 template <typename ParquetType>
 using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;

@@ -222,7 +217,6 @@ std::shared_ptr<::arrow::Table> RandomStringTable(int64_t length, int64_t unique
       ::arrow::schema({::arrow::field("column", type, null_percentage > 0)}), {arr});
 }

-template <bool nullable>
 static void BM_WriteBinaryColumn(::benchmark::State& state) {
   std::shared_ptr<::arrow::Table> table =
       RandomStringTable(BENCHMARK_SIZE, state.range(1), state.range(0));
@@ -238,11 +232,10 @@ static void BM_WriteBinaryColumn(::benchmark::State& state) {
   state.SetBytesProcessed(total_bytes * state.iterations());
 }

-BENCHMARK_TEMPLATE1(BM_WriteBinaryColumn, false)
+BENCHMARK(BM_WriteBinaryColumn)
     ->Args({/*null_probability*/ 0, /*unique_values*/ 2})
     ->Args({/*null_probability*/ 0, /*unique_values*/ 32})
-    ->Args({/*null_probability*/ 0, /*unique_values*/ kInfiniteUniqueValues});
-BENCHMARK_TEMPLATE1(BM_WriteBinaryColumn, true)
+    ->Args({/*null_probability*/ 0, /*unique_values*/ kInfiniteUniqueValues})
     ->Args({/*null_probability*/ 1, /*unique_values*/ 32})
     ->Args({/*null_probability*/ 50, /*unique_values*/ 32})
     ->Args({/*null_probability*/ 99, /*unique_values*/ 32})
     ->Args({/*null_probability*/ 1, /*unique_values*/ kInfiniteUniqueValues})
     ->Args({/*null_probability*/ 50, /*unique_values*/ kInfiniteUniqueValues})
     ->Args({/*null_probability*/ 99, /*unique_values*/ kInfiniteUniqueValues});
@@ -376,7 +369,6 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
 // Benchmark reading binary column
 //

-template <bool nullable>
 static void BM_ReadBinaryColumn(::benchmark::State& state) {
   std::shared_ptr<::arrow::Table> table =
       RandomStringTable(BENCHMARK_SIZE, state.range(1), state.range(0));
@@ -385,11 +377,10 @@ static void BM_ReadBinaryColumn(::benchmark::State& state) {
   BenchmarkReadTable(state, *table, table->num_rows(), total_bytes);
 }

-BENCHMARK_TEMPLATE1(BM_ReadBinaryColumn, false)
+BENCHMARK(BM_ReadBinaryColumn)
     ->Args({/*null_probability*/ 0, /*unique_values*/ 2})
     ->Args({/*null_probability*/ 0, /*unique_values*/ 32})
-    ->Args({/*null_probability*/ 0, /*unique_values*/ kInfiniteUniqueValues});
-BENCHMARK_TEMPLATE1(BM_ReadBinaryColumn, true)
+    ->Args({/*null_probability*/ 0, /*unique_values*/ kInfiniteUniqueValues})
     ->Args({/*null_probability*/ 1, /*unique_values*/ 32})
     ->Args({/*null_probability*/ 50, /*unique_values*/ 32})
     ->Args({/*null_probability*/ 99, /*unique_values*/ 32})

From 4a05c5a8f3f0b4a0781a8bf6de30535091c7b209 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Tue, 3 Jan 2023 13:32:57 -0800
Subject: [PATCH 4/4] pr feedback

---
 .../parquet/arrow/reader_writer_benchmark.cc  | 50 +++++++++++--------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
index fcc76d5fe2e..95c4a659297 100644
--- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc
+++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
@@ -203,7 +203,7 @@ std::shared_ptr<::arrow::Table> RandomStringTable(int64_t length, int64_t unique
                                                   int64_t null_percentage) {
   std::shared_ptr<::arrow::DataType> type = ::arrow::utf8();
   std::shared_ptr<::arrow::Array> arr;
-  ::arrow::random::RandomArrayGenerator generator(500);
+  ::arrow::random::RandomArrayGenerator generator(/*seed=*/500);
   double null_probability = static_cast<double>(null_percentage) / 100.0;
   if (unique_values == kInfiniteUniqueValues) {
     arr = generator.String(length, /*min_length=*/3, /*max_length=*/32,
@@ -227,21 +227,25 @@ static void BM_WriteBinaryColumn(::benchmark::State& state) {
         WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
   }

-  int64_t total_bytes = table->column(0)->chunk(0)->data()->buffers[1]->size();
+  // Offsets + data
+  int64_t total_bytes = table->column(0)->chunk(0)->data()->buffers[1]->size() +
+                        table->column(0)->chunk(0)->data()->buffers[2]->size();
   state.SetItemsProcessed(BENCHMARK_SIZE * state.iterations());
   state.SetBytesProcessed(total_bytes * state.iterations());
 }

 BENCHMARK(BM_WriteBinaryColumn)
-    ->Args({/*null_probability*/ 0, /*unique_values*/ 2})
-    ->Args({/*null_probability*/ 0, /*unique_values*/ 32})
-    ->Args({/*null_probability*/ 0, /*unique_values*/ kInfiniteUniqueValues})
-    ->Args({/*null_probability*/ 1, /*unique_values*/ 32})
-    ->Args({/*null_probability*/ 50, /*unique_values*/ 32})
-    ->Args({/*null_probability*/ 99, /*unique_values*/ 32})
-    ->Args({/*null_probability*/ 1, /*unique_values*/ kInfiniteUniqueValues})
-    ->Args({/*null_probability*/ 50, /*unique_values*/ kInfiniteUniqueValues})
-    ->Args({/*null_probability*/ 99, /*unique_values*/ kInfiniteUniqueValues});
+    ->ArgNames({"null_probability", "unique_values"})
+    // We vary unique values to trigger the dictionary-encoded (for low-cardinality)
+    // and plain (for high-cardinality) code paths.
+    ->Args({0, 32})
+    ->Args({0, kInfiniteUniqueValues})
+    ->Args({1, 32})
+    ->Args({50, 32})
+    ->Args({99, 32})
+    ->Args({1, kInfiniteUniqueValues})
+    ->Args({50, kInfiniteUniqueValues})
+    ->Args({99, kInfiniteUniqueValues});

 template
 struct Examples {
@@ -373,20 +377,24 @@ static void BM_ReadBinaryColumn(::benchmark::State& state) {
   std::shared_ptr<::arrow::Table> table =
       RandomStringTable(BENCHMARK_SIZE, state.range(1), state.range(0));

-  int64_t total_bytes = table->column(0)->chunk(0)->data()->buffers[1]->size();
+  // Offsets + data
+  int64_t total_bytes = table->column(0)->chunk(0)->data()->buffers[1]->size() +
+                        table->column(0)->chunk(0)->data()->buffers[2]->size();
   BenchmarkReadTable(state, *table, table->num_rows(), total_bytes);
 }

 BENCHMARK(BM_ReadBinaryColumn)
-    ->Args({/*null_probability*/ 0, /*unique_values*/ 2})
-    ->Args({/*null_probability*/ 0, /*unique_values*/ 32})
-    ->Args({/*null_probability*/ 0, /*unique_values*/ kInfiniteUniqueValues})
-    ->Args({/*null_probability*/ 1, /*unique_values*/ 32})
-    ->Args({/*null_probability*/ 50, /*unique_values*/ 32})
-    ->Args({/*null_probability*/ 99, /*unique_values*/ 32})
-    ->Args({/*null_probability*/ 1, /*unique_values*/ kInfiniteUniqueValues})
-    ->Args({/*null_probability*/ 50, /*unique_values*/ kInfiniteUniqueValues})
-    ->Args({/*null_probability*/ 99, /*unique_values*/ kInfiniteUniqueValues});
+    ->ArgNames({"null_probability", "unique_values"})
+    // We vary unique values to trigger the dictionary-encoded (for low-cardinality)
+    // and plain (for high-cardinality) code paths.
+    ->Args({0, 32})
+    ->Args({0, kInfiniteUniqueValues})
+    ->Args({1, 32})
+    ->Args({50, 32})
+    ->Args({99, 32})
+    ->Args({1, kInfiniteUniqueValues})
+    ->Args({50, kInfiniteUniqueValues})
+    ->Args({99, kInfiniteUniqueValues});

 //
 // Benchmark reading a nested column
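
Aside (not part of the patch series): the "Offsets + data" totals in PATCH 4 rely on the Arrow layout of a ::arrow::utf8() array, where buffers[0] is the validity bitmap, buffers[1] holds the int32 offsets, and buffers[2] holds the character data. Below is a minimal standalone sketch of that accounting, not the benchmark itself; it assumes the Arrow C++ library plus its testing component (which provides RandomArrayGenerator), and the row count and null probability are arbitrary illustrative values.

// Illustrative sketch only: prints the offsets and value buffer sizes that the
// benchmarks sum when calling SetBytesProcessed() for a string column.
#include <cstdint>
#include <iostream>
#include <memory>

#include "arrow/api.h"
#include "arrow/testing/random.h"

int main() {
  ::arrow::random::RandomArrayGenerator generator(/*seed=*/500);
  // 1024 rows and 50% nulls are arbitrary example values.
  std::shared_ptr<::arrow::Array> arr =
      generator.String(/*size=*/1024, /*min_length=*/3, /*max_length=*/32,
                       /*null_probability=*/0.5);

  // String/binary layout: buffers[0] = validity bitmap,
  // buffers[1] = int32 offsets, buffers[2] = character data.
  const std::shared_ptr<::arrow::ArrayData>& data = arr->data();
  int64_t offsets_bytes = data->buffers[1]->size();
  int64_t value_bytes = data->buffers[2]->size();
  std::cout << "offsets: " << offsets_bytes << " bytes, values: " << value_bytes
            << " bytes, total: " << (offsets_bytes + value_bytes) << " bytes\n";
  return 0;
}

The final patch adds buffers[2] to the earlier buffers[1]-only figure, so the reported bytes/second covers both the offsets and the string payload rather than the offsets alone.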