From da0b1a84ba7ba00085e125ded7deb00eb5065bff Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 11 May 2020 04:06:19 +0000 Subject: [PATCH 1/6] expand performance test coverage --- .../parquet/arrow/reader_writer_benchmark.cc | 105 ++++++++++++++---- 1 file changed, 85 insertions(+), 20 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index 66bc9e9b46e..1fa7e052ee6 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -18,6 +18,7 @@ #include "benchmark/benchmark.h" #include +#include #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" @@ -28,6 +29,7 @@ #include "parquet/platform.h" #include "arrow/api.h" +#include "arrow/util/logging.h" using arrow::BooleanBuilder; using arrow::NumericBuilder; @@ -95,15 +97,29 @@ void SetBytesProcessed(::benchmark::State& state) { state.SetBytesProcessed(bytes_processed); } +constexpr int64_t kAlternatingOrNa = -1; + template std::shared_ptr<::arrow::Table> TableFromVector( - const std::vector& vec, bool nullable) { + const std::vector& vec, bool nullable, + int null_percentage = kAlternatingOrNa) { + if (!nullable) { + DCHECK(null_percentage = kAlternatingOrNa); + } std::shared_ptr<::arrow::DataType> type = std::make_shared>(); NumericBuilder> builder; if (nullable) { std::vector valid_bytes(BENCHMARK_SIZE, 0); - int n = {0}; - std::generate(valid_bytes.begin(), valid_bytes.end(), [&n] { return n++ % 2; }); + if (null_percentage == -1) { + int n = {0}; + std::generate(valid_bytes.begin(), valid_bytes.end(), [&n] { return n++ % 2; }); + } else { + std::default_random_engine rng(500); + double valid_probability = 1.0 - (static_cast(null_percentage) / 100); + std::bernoulli_distribution dist(valid_probability); + std::generate(valid_bytes.begin(), valid_bytes.end(), + [&] { return static_cast(dist(rng)); }); + } EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), valid_bytes.data())); } else { EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), nullptr)); @@ -118,13 +134,22 @@ std::shared_ptr<::arrow::Table> TableFromVector( template <> std::shared_ptr<::arrow::Table> TableFromVector(const std::vector& vec, - bool nullable) { + bool nullable, + int null_percentage) { BooleanBuilder builder; if (nullable) { std::vector valid_bytes(BENCHMARK_SIZE, 0); - int n = {0}; - std::generate(valid_bytes.begin(), valid_bytes.end(), - [&n] { return (n++ % 2) != 0; }); + if (null_percentage == -1) { + int n = {0}; + std::generate(valid_bytes.begin(), valid_bytes.end(), + [&n] { return n++ % 2 != 0; }); + } else { + std::default_random_engine rng(500); + double valid_probability = 1.0 - (static_cast(null_percentage) / 100); + std::bernoulli_distribution dist(valid_probability); + std::generate(valid_bytes.begin(), valid_bytes.end(), + [&] { return static_cast(dist(rng)); }); + } EXIT_NOT_OK(builder.AppendValues(vec, valid_bytes)); } else { EXIT_NOT_OK(builder.AppendValues(vec)); @@ -141,7 +166,7 @@ std::shared_ptr<::arrow::Table> TableFromVector(const std::vector static void BM_WriteColumn(::benchmark::State& state) { using T = typename ParquetType::c_type; - std::vector values(BENCHMARK_SIZE, static_cast(128)); + std::vector values(BENCHMARK_SIZE, 128); std::shared_ptr<::arrow::Table> table = TableFromVector(values, nullable); while (state.KeepRunning()) { @@ -169,7 +194,14 @@ static void BM_ReadColumn(::benchmark::State& state) { using T = typename ParquetType::c_type; std::vector values(BENCHMARK_SIZE, static_cast(128)); - std::shared_ptr<::arrow::Table> table = TableFromVector(values, nullable); + double value_probability = static_cast(state.range(1)) / 100.0; + std::default_random_engine rng(500); + std::bernoulli_distribution dist(value_probability); + std::generate(values.begin(), values.end(), + [&] { return static_cast(dist(rng) * 128); }); + + std::shared_ptr<::arrow::Table> table = + TableFromVector(values, nullable, state.range(0)); auto output = CreateOutputStream(); EXIT_NOT_OK(WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE)); @@ -187,17 +219,50 @@ static void BM_ReadColumn(::benchmark::State& state) { SetBytesProcessed(state); } -BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type); -BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type); - -BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type); -BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type); - -BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType); -BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType); - -BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType); -BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType); +BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type) + ->Args({/*null_percentage=*/kAlternatingOrNa, 1}) + ->Args({/*null_percentage=*/kAlternatingOrNa, 10}) + ->Args({/*null_percentage=*/kAlternatingOrNa, 50}); + +BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type) + ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0}) + ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10}) + ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50}) + ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25}) + ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50}) + ->Args({/*null_percentage=*/50, /*first_value_percentage=*/100}) + ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50}) + ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100}); + +BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type) + ->Args({/*null_percentage=*/kAlternatingOrNa, 1}) + ->Args({/*null_percentage=*/kAlternatingOrNa, 10}) + ->Args({/*null_percentage=*/kAlternatingOrNa, 50}); +BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type) + ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0}) + ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10}) + ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50}) + ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25}) + ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50}) + ->Args({/*null_percentage=*/50, /*first_value_percentage=*/100}) + ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50}) + ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100}); + +BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType) + ->Args({kAlternatingOrNa, 0}) + ->Args({1, 20}); +// Less coverage because int64_t should be pretty good representation for nullability and +// repeating values. +BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType) + ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0}) + ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50}) + ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25}); + +BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType) + ->Args({kAlternatingOrNa, 0}) + ->Args({1, 20}); +BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType) + ->Ranges({{kAlternatingOrNa, 0}, {0, 10}}); static void BM_ReadIndividualRowGroups(::benchmark::State& state) { std::vector values(BENCHMARK_SIZE, 128); From 6c6295010e3556b61ce2390f46827c2a05a027e1 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 14 May 2020 04:33:56 +0000 Subject: [PATCH 2/6] add more points to int64 --- cpp/src/parquet/arrow/reader_writer_benchmark.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index 1fa7e052ee6..f0f5462b746 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -241,10 +241,15 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type) BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type) ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0}) ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10}) + ->Args({/*null_percentage=*/5, /*first_value_percentage=*/10}) ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50}) ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25}) + ->Args({/*null_percentage=*/30, /*first_value_percentage=*/25}) + ->Args({/*null_percentage=*/35, /*first_value_percentage=*/25}) + ->Args({/*null_percentage=*/45, /*first_value_percentage=*/25}) ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50}) - ->Args({/*null_percentage=*/50, /*first_value_percentage=*/100}) + ->Args({/*null_percentage=*/50, /*first_value_percentage=*/1}) + ->Args({/*null_percentage=*/75, /*first_value_percentage=*/1}) ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50}) ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100}); From 0e6aa53d3c1f630f76eb14d390986b051e7835ec Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 14 May 2020 04:55:10 +0000 Subject: [PATCH 3/6] remove range for boolean --- cpp/src/parquet/arrow/reader_writer_benchmark.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index f0f5462b746..1be3057a92f 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -267,7 +267,8 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType) ->Args({kAlternatingOrNa, 0}) ->Args({1, 20}); BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType) - ->Ranges({{kAlternatingOrNa, 0}, {0, 10}}); + ->Args({kAlternatingOrNa, 1}) + ->Args({5, 10}); static void BM_ReadIndividualRowGroups(::benchmark::State& state) { std::vector values(BENCHMARK_SIZE, 128); From fc009ec11b06e3fb70b4a7725560292b2b03a092 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 16 May 2020 06:30:06 +0000 Subject: [PATCH 4/6] address review comments --- .../parquet/arrow/reader_writer_benchmark.cc | 98 ++++++++++--------- 1 file changed, 54 insertions(+), 44 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index 1be3057a92f..1328828018b 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -99,27 +99,35 @@ void SetBytesProcessed(::benchmark::State& state) { constexpr int64_t kAlternatingOrNa = -1; +template +std::vector RandomVector(int64_t true_percentage, int64_t vector_size, + const std::array& sample_values) { + std::vector values(BENCHMARK_SIZE, {}); + if (true_percentage == kAlternatingOrNa) { + int n = {0}; + std::generate(values.begin(), values.end(), [&n] { return n++ % 2; }); + } else { + std::default_random_engine rng(500); + double true_probability = static_cast(true_percentage) / 100.0; + std::bernoulli_distribution dist(true_probability); + std::generate(values.begin(), values.end(), [&] { return sample_values[dist(rng)]; }); + } + return values; +} + template std::shared_ptr<::arrow::Table> TableFromVector( const std::vector& vec, bool nullable, - int null_percentage = kAlternatingOrNa) { + int64_t null_percentage = kAlternatingOrNa) { if (!nullable) { DCHECK(null_percentage = kAlternatingOrNa); } std::shared_ptr<::arrow::DataType> type = std::make_shared>(); NumericBuilder> builder; if (nullable) { - std::vector valid_bytes(BENCHMARK_SIZE, 0); - if (null_percentage == -1) { - int n = {0}; - std::generate(valid_bytes.begin(), valid_bytes.end(), [&n] { return n++ % 2; }); - } else { - std::default_random_engine rng(500); - double valid_probability = 1.0 - (static_cast(null_percentage) / 100); - std::bernoulli_distribution dist(valid_probability); - std::generate(valid_bytes.begin(), valid_bytes.end(), - [&] { return static_cast(dist(rng)); }); - } + // Note true values select index 1 of sample_values + auto valid_bytes = RandomVector(/*true_percengate=*/null_percentage, + BENCHMARK_SIZE, /*sample_values=*/{1, 0}); EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), valid_bytes.data())); } else { EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), nullptr)); @@ -135,21 +143,11 @@ std::shared_ptr<::arrow::Table> TableFromVector( template <> std::shared_ptr<::arrow::Table> TableFromVector(const std::vector& vec, bool nullable, - int null_percentage) { + int64_t null_percentage) { BooleanBuilder builder; if (nullable) { - std::vector valid_bytes(BENCHMARK_SIZE, 0); - if (null_percentage == -1) { - int n = {0}; - std::generate(valid_bytes.begin(), valid_bytes.end(), - [&n] { return n++ % 2 != 0; }); - } else { - std::default_random_engine rng(500); - double valid_probability = 1.0 - (static_cast(null_percentage) / 100); - std::bernoulli_distribution dist(valid_probability); - std::generate(valid_bytes.begin(), valid_bytes.end(), - [&] { return static_cast(dist(rng)); }); - } + auto valid_bytes = RandomVector(/*true_percentage=*/null_percentage, + BENCHMARK_SIZE, {true, false}); EXIT_NOT_OK(builder.AppendValues(vec, valid_bytes)); } else { EXIT_NOT_OK(builder.AppendValues(vec)); @@ -189,16 +187,22 @@ BENCHMARK_TEMPLATE2(BM_WriteColumn, true, DoubleType); BENCHMARK_TEMPLATE2(BM_WriteColumn, false, BooleanType); BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType); +template +struct Examples { + static constexpr std::array values() { return {127, 128}; } +}; + +template <> +struct Examples { + static constexpr std::array values() { return {false, true}; } +}; + template static void BM_ReadColumn(::benchmark::State& state) { using T = typename ParquetType::c_type; - std::vector values(BENCHMARK_SIZE, static_cast(128)); - double value_probability = static_cast(state.range(1)) / 100.0; - std::default_random_engine rng(500); - std::bernoulli_distribution dist(value_probability); - std::generate(values.begin(), values.end(), - [&] { return static_cast(dist(rng) * 128); }); + auto values = RandomVector(/*percentage=*/state.range(1), BENCHMARK_SIZE, + Examples::values()); std::shared_ptr<::arrow::Table> table = TableFromVector(values, nullable, state.range(0)); @@ -219,6 +223,12 @@ static void BM_ReadColumn(::benchmark::State& state) { SetBytesProcessed(state); } +// There are two parameters here that cover different data distributions. +// null_percentage governs distribution and therefore runs of null values. +// first_value_percentage governs distribution of values (we select from 1 of 2) +// so when 0 or 100 RLE is triggered all the time. When a value in the range (0, 100) +// there will be some percentage of RLE encoded values and some percentage of literal +// encoded values (RLE is much less likely with percentages close to 50). BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type) ->Args({/*null_percentage=*/kAlternatingOrNa, 1}) ->Args({/*null_percentage=*/kAlternatingOrNa, 10}) @@ -226,13 +236,13 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type) BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type) ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0}) - ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10}) - ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50}) - ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25}) + ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1}) + ->Args({/*null_percentage=*/10, /*first_value_percentage=*/10}) + ->Args({/*null_percentage=*/25, /*first_value_percentage=*/5}) ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50}) - ->Args({/*null_percentage=*/50, /*first_value_percentage=*/100}) + ->Args({/*null_percentage=*/50, /*first_value_percentage=*/0}) ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50}) - ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100}); + ->Args({/*null_percentage=*/99, /*first_value_percentage=*/0}); BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type) ->Args({/*null_percentage=*/kAlternatingOrNa, 1}) @@ -240,22 +250,22 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type) ->Args({/*null_percentage=*/kAlternatingOrNa, 50}); BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type) ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0}) - ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10}) - ->Args({/*null_percentage=*/5, /*first_value_percentage=*/10}) - ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50}) - ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25}) - ->Args({/*null_percentage=*/30, /*first_value_percentage=*/25}) - ->Args({/*null_percentage=*/35, /*first_value_percentage=*/25}) + ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1}) + ->Args({/*null_percentage=*/5, /*first_value_percentage=*/5}) + ->Args({/*null_percentage=*/10, /*first_value_percentage=*/5}) + ->Args({/*null_percentage=*/25, /*first_value_percentage=*/10}) + ->Args({/*null_percentage=*/30, /*first_value_percentage=*/10}) + ->Args({/*null_percentage=*/35, /*first_value_percentage=*/10}) ->Args({/*null_percentage=*/45, /*first_value_percentage=*/25}) ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50}) ->Args({/*null_percentage=*/50, /*first_value_percentage=*/1}) ->Args({/*null_percentage=*/75, /*first_value_percentage=*/1}) ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50}) - ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100}); + ->Args({/*null_percentage=*/99, /*first_value_percentage=*/0}); BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType) ->Args({kAlternatingOrNa, 0}) - ->Args({1, 20}); + ->Args({kAlternatingOrNa, 20}); // Less coverage because int64_t should be pretty good representation for nullability and // repeating values. BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType) From 7f9c6dd7f4aebad62348f2c30001485d33a71b3a Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 19 May 2020 02:46:48 +0000 Subject: [PATCH 5/6] ARROW_CHECK_EQ --- cpp/src/parquet/arrow/reader_writer_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index 1328828018b..0bc3ac8cc34 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -120,7 +120,7 @@ std::shared_ptr<::arrow::Table> TableFromVector( const std::vector& vec, bool nullable, int64_t null_percentage = kAlternatingOrNa) { if (!nullable) { - DCHECK(null_percentage = kAlternatingOrNa); + ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa); } std::shared_ptr<::arrow::DataType> type = std::make_shared>(); NumericBuilder> builder; From e3fadadf86deeae2e836c2e137feb76664841ee1 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Tue, 19 May 2020 09:59:01 -0700 Subject: [PATCH 6/6] fix typo --- cpp/src/parquet/arrow/reader_writer_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index 0bc3ac8cc34..bf3a93c3788 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -126,7 +126,7 @@ std::shared_ptr<::arrow::Table> TableFromVector( NumericBuilder> builder; if (nullable) { // Note true values select index 1 of sample_values - auto valid_bytes = RandomVector(/*true_percengate=*/null_percentage, + auto valid_bytes = RandomVector(/*true_percentage=*/null_percentage, BENCHMARK_SIZE, /*sample_values=*/{1, 0}); EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), valid_bytes.data())); } else {