From 999865b042c3131920b52b40a2387535168f3a08 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 17 Jun 2020 20:09:14 -0500 Subject: [PATCH 1/2] Add crude benchmark for filtering record batches --- .../kernels/vector_selection_benchmark.cc | 56 ++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc index cc97c1cc678..cff80e0d341 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc @@ -18,6 +18,7 @@ #include "benchmark/benchmark.h" #include +#include #include "arrow/compute/api_vector.h" #include "arrow/compute/benchmark_util.h" @@ -69,7 +70,11 @@ std::vector g_filter_params = { // RAII struct to handle some of the boilerplate in filter struct FilterArgs { // size of memory tested (per iteration) in bytes - const int64_t size; + int64_t size; + + // What to call the "size" that's reported in the console output, for result + // interpretability. + std::string size_name = "size"; double values_null_proportion = 0.; double selected_proportion = 0.; @@ -84,7 +89,7 @@ struct FilterArgs { } ~FilterArgs() { - state_.counters["size"] = static_cast(size); + state_.counters[size_name] = static_cast(size); state_.counters["select%"] = selected_proportion * 100; state_.counters["data null%"] = values_null_proportion * 100; state_.counters["mask null%"] = filter_null_proportion * 100; @@ -198,6 +203,40 @@ struct FilterBenchmark { ABORT_NOT_OK(Filter(values, filter).status()); } } + + void BenchRecordBatch() { + const int64_t total_data_cells = 10000000; + const int64_t num_columns = state.range(0); + const int64_t num_rows = total_data_cells / num_columns; + + auto col_data = rand.Float64(num_rows, 0, 1); + + auto filter = + rand.Boolean(num_rows, args.selected_proportion, args.filter_null_proportion); + + int64_t output_length = + internal::GetFilterOutputSize(*filter->data(), FilterOptions::DROP); + + // HACK: set FilterArgs.size to the number of selected data cells * + // sizeof(double) for accurate memory processing performance + args.size = output_length * num_columns * sizeof(double); + args.size_name = "extracted_size"; + state.counters["num_cols"] = num_columns; + + std::vector> columns; + std::vector> fields; + for (int64_t i = 0; i < num_columns; ++i) { + std::stringstream ss; + ss << "f" << i; + fields.push_back(::arrow::field(ss.str(), float64())); + columns.push_back(col_data); + } + + auto batch = RecordBatch::Make(schema(fields), num_rows, columns); + for (auto _ : state) { + ABORT_NOT_OK(Filter(batch, filter).status()); + } + } }; static void FilterInt64FilterNoNulls(benchmark::State& state) { @@ -224,6 +263,10 @@ static void FilterStringFilterWithNulls(benchmark::State& state) { FilterBenchmark(state, true).String(); } +static void FilterRecordBatchNoNulls(benchmark::State& state) { + FilterBenchmark(state, false).BenchRecordBatch(); +} + static void TakeInt64RandomIndicesNoNulls(benchmark::State& state) { TakeBenchmark(state, false).Int64(); } @@ -275,6 +318,15 @@ BENCHMARK(FilterFSLInt64FilterWithNulls)->Apply(FilterSetArgs); BENCHMARK(FilterStringFilterNoNulls)->Apply(FilterSetArgs); BENCHMARK(FilterStringFilterWithNulls)->Apply(FilterSetArgs); +void FilterRecordBatchSetArgs(benchmark::internal::Benchmark* bench) { + for (auto num_cols : std::vector({10, 50, 100})) { + for (int i = 0; i < static_cast(g_filter_params.size()); ++i) { + bench->Args({num_cols, i}); + } + } +} +BENCHMARK(FilterRecordBatchNoNulls)->Apply(FilterRecordBatchSetArgs); + void TakeSetArgs(benchmark::internal::Benchmark* bench) { for (int64_t size : g_data_sizes) { for (auto nulls : std::vector({1000, 10, 2, 1, 0})) { From e4951aa04c5e492ed9925143741cac588dd2b5aa Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 18 Jun 2020 12:05:19 -0500 Subject: [PATCH 2/2] Appease MSVC --- cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc index cff80e0d341..422088c09d3 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc @@ -221,7 +221,7 @@ struct FilterBenchmark { // sizeof(double) for accurate memory processing performance args.size = output_length * num_columns * sizeof(double); args.size_name = "extracted_size"; - state.counters["num_cols"] = num_columns; + state.counters["num_cols"] = static_cast(num_columns); std::vector> columns; std::vector> fields;