diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 4c70a388faf..21043fc9870 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -269,6 +269,9 @@ else()
   add_custom_target(all-benchmarks)
   add_custom_target(benchmark ctest -L benchmark)
   add_dependencies(benchmark all-benchmarks)
+  if(ARROW_BUILD_BENCHMARKS_REFERENCE)
+    add_definitions(-DARROW_WITH_BENCHMARKS_REFERENCE)
+  endif()
 endif()
 
 if(NOT ARROW_BUILD_EXAMPLES)
diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake
index 8ea7346cc98..8b3d80e04da 100644
--- a/cpp/cmake_modules/DefineOptions.cmake
+++ b/cpp/cmake_modules/DefineOptions.cmake
@@ -96,6 +96,11 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
   define_option(ARROW_BUILD_BENCHMARKS "Build the Arrow micro benchmarks, default OFF"
                 OFF)
 
+  # Reference benchmarks are used to compare against naive implementations, or
+  # to discover various hardware limits.
+  define_option(ARROW_BUILD_BENCHMARKS_REFERENCE
+                "Build the Arrow micro reference benchmarks, default OFF." OFF)
+
   define_option_string(ARROW_TEST_LINKAGE
                        "Linkage of Arrow libraries with unit tests executables."
                        "shared"
diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc
index 6ad860af000..bc36970f048 100644
--- a/cpp/src/arrow/builder-benchmark.cc
+++ b/cpp/src/arrow/builder-benchmark.cc
@@ -29,209 +29,168 @@
 #include "arrow/memory_pool.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/util/bit-util.h"
+#include "arrow/util/string_view.h"
 
 namespace arrow {
 
-constexpr int64_t kFinalSize = 256;
+using ValueType = int64_t;
+using VectorType = std::vector<ValueType>;
+constexpr int64_t kNumberOfElements = 256 * 512;
 
-static void BM_BuildPrimitiveArrayNoNulls(
-    benchmark::State& state) {  // NOLINT non-const reference
-  // 2 MiB block
-  std::vector<int64_t> data(256 * 1024, 100);
-  while (state.KeepRunning()) {
+static VectorType AlmostU8CompressibleVector() {
+  VectorType data(kNumberOfElements, 64);
+
+  // Insert an element late in the game that does not fit in the 8bit
+  // representation. This forces AdaptiveIntBuilder to resize.
+  data[kNumberOfElements - 2] = 1L << 13;
+
+  return data;
+}
+
+constexpr int64_t kRounds = 256;
+static VectorType kData = AlmostU8CompressibleVector();
+constexpr int64_t kBytesProcessPerRound = kNumberOfElements * sizeof(ValueType);
+constexpr int64_t kBytesProcessed = kRounds * kBytesProcessPerRound;
+
+static const char* kBinaryString = "12345678";
+static arrow::util::string_view kBinaryView(kBinaryString);
+
+static void BuildIntArrayNoNulls(benchmark::State& state) {  // NOLINT non-const reference
+  for (auto _ : state) {
     Int64Builder builder;
-    for (int i = 0; i < kFinalSize; i++) {
-      // Build up an array of 512 MiB in size
-      ABORT_NOT_OK(builder.AppendValues(data.data(), data.size(), nullptr));
+
+    for (int i = 0; i < kRounds; i++) {
+      ABORT_NOT_OK(builder.AppendValues(kData.data(), kData.size(), nullptr));
     }
+
     std::shared_ptr<Array> out;
     ABORT_NOT_OK(builder.Finish(&out));
   }
-  state.SetBytesProcessed(state.iterations() * data.size() * sizeof(int64_t) *
-                          kFinalSize);
-}
 
-static void BM_BuildVectorNoNulls(
-    benchmark::State& state) {  // NOLINT non-const reference
-  // 2 MiB block
-  std::vector<int64_t> data(256 * 1024, 100);
-  while (state.KeepRunning()) {
-    std::vector<int64_t> builder;
-    for (int i = 0; i < kFinalSize; i++) {
-      // Build up an array of 512 MiB in size
-      builder.insert(builder.end(), data.cbegin(), data.cend());
-    }
-  }
-  state.SetBytesProcessed(state.iterations() * data.size() * sizeof(int64_t) *
-                          kFinalSize);
+  state.SetBytesProcessed(state.iterations() * kBytesProcessed);
 }
 
-static void BM_BuildAdaptiveIntNoNulls(
+static void BuildAdaptiveIntNoNulls(
     benchmark::State& state) {  // NOLINT non-const reference
-  int64_t size = static_cast<int64_t>(std::numeric_limits<int16_t>::max()) * 256;
-  int64_t chunk_size = size / 8;
-  std::vector<int64_t> data(size);
-  for (int64_t i = 0; i < size; i++) {
-    data[i] = i;
-  }
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     AdaptiveIntBuilder builder;
-    for (int64_t i = 0; i < size; i += chunk_size) {
-      // Build up an array of 128 MiB in size
-      ABORT_NOT_OK(builder.AppendValues(data.data() + i, chunk_size, nullptr));
+
+    for (int i = 0; i < kRounds; i++) {
+      ABORT_NOT_OK(builder.AppendValues(kData.data(), kData.size(), nullptr));
     }
+
     std::shared_ptr<Array> out;
     ABORT_NOT_OK(builder.Finish(&out));
   }
-  state.SetBytesProcessed(state.iterations() * data.size() * sizeof(int64_t));
+
+  state.SetBytesProcessed(state.iterations() * kBytesProcessed);
 }
 
-static void BM_BuildAdaptiveIntNoNullsScalarAppend(
+static void BuildAdaptiveIntNoNullsScalarAppend(
     benchmark::State& state) {  // NOLINT non-const reference
-  int64_t size = static_cast<int64_t>(std::numeric_limits<int16_t>::max()) * 256;
-  std::vector<int64_t> data(size);
-  for (int64_t i = 0; i < size; i++) {
-    data[i] = i;
-  }
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     AdaptiveIntBuilder builder;
-    for (int64_t i = 0; i < size; i++) {
-      ABORT_NOT_OK(builder.Append(data[i]));
-    }
-    std::shared_ptr<Array> out;
-    ABORT_NOT_OK(builder.Finish(&out));
-  }
-  state.SetBytesProcessed(state.iterations() * data.size() * sizeof(int64_t));
-}
 
-static void BM_BuildAdaptiveUIntNoNulls(
-    benchmark::State& state) {  // NOLINT non-const reference
-  int64_t size = static_cast<int64_t>(std::numeric_limits<uint16_t>::max()) * 256;
-  int64_t chunk_size = size / 8;
-  std::vector<uint64_t> data(size);
-  for (uint64_t i = 0; i < static_cast<uint64_t>(size); i++) {
-    data[i] = i;
-  }
-  while (state.KeepRunning()) {
-    AdaptiveUIntBuilder builder;
-    for (int64_t i = 0; i < size; i += chunk_size) {
-      // Build up an array of 128 MiB in size
-      ABORT_NOT_OK(builder.AppendValues(data.data() + i, chunk_size, nullptr));
+    for (int i = 0; i < kRounds; i++) {
+      for (size_t j = 0; j < kData.size(); j++) {
+        ABORT_NOT_OK(builder.Append(kData[j]));
+      }
     }
-    std::shared_ptr<Array> out;
-    ABORT_NOT_OK(builder.Finish(&out));
-  }
-  state.SetBytesProcessed(state.iterations() * data.size() * sizeof(int64_t));
-}
 
-static void BM_BuildAdaptiveUIntNoNullsScalarAppend(
-    benchmark::State& state) {  // NOLINT non-const reference
-  int64_t size = static_cast<int64_t>(std::numeric_limits<uint16_t>::max()) * 256;
-  std::vector<uint64_t> data(size);
-  for (uint64_t i = 0; i < static_cast<uint64_t>(size); i++) {
-    data[i] = i;
-  }
-  while (state.KeepRunning()) {
-    AdaptiveUIntBuilder builder;
-    for (int64_t i = 0; i < size; i++) {
-      ABORT_NOT_OK(builder.Append(data[i]));
-    }
     std::shared_ptr<Array> out;
     ABORT_NOT_OK(builder.Finish(&out));
   }
-  state.SetBytesProcessed(state.iterations() * data.size() * sizeof(int64_t));
+
+  state.SetBytesProcessed(state.iterations() * kBytesProcessed);
 }
 
-static void BM_BuildBooleanArrayNoNulls(
+static void BuildBooleanArrayNoNulls(
     benchmark::State& state) {  // NOLINT non-const reference
-  // 2 MiB block
-  std::vector<uint8_t> data(2 * 1024 * 1024);
-  constexpr uint8_t bit_pattern = 0xcc;  // 0b11001100
-  uint64_t index = 0;
-  std::generate(data.begin(), data.end(),
-                [&]() -> uint8_t { return (bit_pattern >> ((index++) % 8)) & 1; });
-
-  while (state.KeepRunning()) {
+
+  size_t n_bytes = kBytesProcessPerRound;
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(kData.data());
+
+  for (auto _ : state) {
     BooleanBuilder builder;
-    for (int i = 0; i < kFinalSize; i++) {
-      // Build up an array of 512 MiB in size
-      ABORT_NOT_OK(builder.AppendValues(data.data(), data.size()));
+
+    for (int i = 0; i < kRounds; i++) {
+      ABORT_NOT_OK(builder.AppendValues(data, n_bytes));
     }
+
     std::shared_ptr<Array> out;
     ABORT_NOT_OK(builder.Finish(&out));
   }
-  state.SetBytesProcessed(state.iterations() * data.size() * kFinalSize);
-}
 
-static void BM_BuildBinaryArray(benchmark::State& state) {  // NOLINT non-const reference
-  // About 160MB
-  const int64_t iterations = 1 << 24;
-  std::string value = "1234567890";
+  state.SetBytesProcessed(state.iterations() * kBytesProcessed);
+}
 
+static void BuildBinaryArray(benchmark::State& state) {  // NOLINT non-const reference
   for (auto _ : state) {
     BinaryBuilder builder;
-    for (int64_t i = 0; i < iterations; i++) {
-      ABORT_NOT_OK(builder.Append(value));
+
+    for (int64_t i = 0; i < kRounds * kNumberOfElements; i++) {
+      ABORT_NOT_OK(builder.Append(kBinaryView));
     }
+
     std::shared_ptr<Array> out;
     ABORT_NOT_OK(builder.Finish(&out));
   }
-  state.SetBytesProcessed(state.iterations() * iterations * value.size());
+
+  state.SetBytesProcessed(state.iterations() * kBytesProcessed);
 }
 
-static void BM_BuildChunkedBinaryArray(
+static void BuildChunkedBinaryArray(
    benchmark::State& state) {  // NOLINT non-const reference
-  // About 160MB
-  const int64_t iterations = 1 << 24;
-  std::string value = "1234567890";
+  // 1MB chunks
+  const int32_t kChunkSize = 1 << 20;
 
   for (auto _ : state) {
-    // 1MB chunks
-    const int32_t chunksize = 1 << 20;
-    internal::ChunkedBinaryBuilder builder(chunksize);
-    for (int64_t i = 0; i < iterations; i++) {
-      ABORT_NOT_OK(builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
-                                  static_cast<int32_t>(value.size())));
+    internal::ChunkedBinaryBuilder builder(kChunkSize);
+
+    for (int64_t i = 0; i < kRounds * kNumberOfElements; i++) {
+      ABORT_NOT_OK(builder.Append(kBinaryView));
     }
+
     ArrayVector out;
     ABORT_NOT_OK(builder.Finish(&out));
   }
-  state.SetBytesProcessed(state.iterations() * iterations * value.size());
+
+  state.SetBytesProcessed(state.iterations() * kBytesProcessed);
 }
 
-static void BM_BuildFixedSizeBinaryArray(
+static void BuildFixedSizeBinaryArray(
     benchmark::State& state) {  // NOLINT non-const reference
-  const int64_t iterations = 1 << 20;
-  const int width = 10;
+  auto type = fixed_size_binary(kBinaryView.size());
 
-  auto type = fixed_size_binary(width);
-  const char value[width + 1] = "1234567890";
-
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     FixedSizeBinaryBuilder builder(type);
-    for (int64_t i = 0; i < iterations; i++) {
-      ABORT_NOT_OK(builder.Append(value));
+
+    for (int64_t i = 0; i < kRounds * kNumberOfElements; i++) {
+      ABORT_NOT_OK(builder.Append(kBinaryView));
     }
+
     std::shared_ptr<Array> out;
     ABORT_NOT_OK(builder.Finish(&out));
   }
-  state.SetBytesProcessed(state.iterations() * iterations * width);
+
+  state.SetBytesProcessed(state.iterations() * kBytesProcessed);
 }
 
 // ----------------------------------------------------------------------
 // DictionaryBuilder benchmarks
 
+size_t kDistinctElements = kNumberOfElements / 100;
+
 // Testing with different distributions of integer values helps stress
 // the hash table's robustness.
 
 // Make a vector out of `n_distinct` sequential int values
-template <class Integer>
-static std::vector<Integer> MakeSequentialIntDictFodder(int32_t n_values,
-                                                        int32_t n_distinct) {
+template <class Integer = int64_t>
+static std::vector<Integer> MakeSequentialIntDictFodder() {
   std::default_random_engine gen(42);
-  std::vector<Integer> values(n_values);
+  std::vector<Integer> values(kNumberOfElements);
   {
-    std::uniform_int_distribution<Integer> values_dist(0, n_distinct - 1);
+    std::uniform_int_distribution<Integer> values_dist(0, kDistinctElements - 1);
     std::generate(values.begin(), values.end(), [&]() { return values_dist(gen); });
   }
   return values;
@@ -239,15 +198,15 @@ static std::vector<Integer> MakeSequentialIntDictFodder(int32_t n_values,
 
 // Make a vector out of `n_distinct` int values with potentially colliding hash
 // entries as only their highest bits differ.
-template <class Integer>
-static std::vector<Integer> MakeSimilarIntDictFodder(int32_t n_values,
-                                                     int32_t n_distinct) {
+template <class Integer = int64_t>
+static std::vector<Integer> MakeSimilarIntDictFodder() {
   std::default_random_engine gen(42);
-  std::vector<Integer> values(n_values);
+  std::vector<Integer> values(kNumberOfElements);
   {
-    std::uniform_int_distribution<Integer> values_dist(0, n_distinct - 1);
+    std::uniform_int_distribution<Integer> values_dist(0, kDistinctElements - 1);
     auto max_int = std::numeric_limits<Integer>::max();
-    auto multiplier = static_cast<Integer>(BitUtil::NextPower2(max_int / n_distinct / 2));
+    auto multiplier =
+        static_cast<Integer>(BitUtil::NextPower2(max_int / kDistinctElements / 2));
     std::generate(values.begin(), values.end(),
                   [&]() { return multiplier * values_dist(gen); });
   }
@@ -255,12 +214,11 @@ static std::vector<Integer> MakeSimilarIntDictFodder(int32_t n_values,
 }
 
 // Make a vector out of `n_distinct` random int values
-template <class Integer>
-static std::vector<Integer> MakeRandomIntDictFodder(int32_t n_values,
-                                                    int32_t n_distinct) {
+template <class Integer = int64_t>
+static std::vector<Integer> MakeRandomIntDictFodder() {
   std::default_random_engine gen(42);
-  std::vector<Integer> values_dict(n_distinct);
-  std::vector<Integer> values(n_values);
+  std::vector<Integer> values_dict(kDistinctElements);
+  std::vector<Integer> values(kNumberOfElements);
 
   {
     std::uniform_int_distribution<int64_t> values_dist(
@@ -269,19 +227,18 @@ static std::vector<Integer> MakeRandomIntDictFodder(int32_t n_values,
                   [&]() { return static_cast<Integer>(values_dist(gen)); });
   }
   {
-    std::uniform_int_distribution<int32_t> indices_dist(0, n_distinct - 1);
+    std::uniform_int_distribution<int32_t> indices_dist(0, kDistinctElements - 1);
     std::generate(values.begin(), values.end(),
                   [&]() { return values_dict[indices_dist(gen)]; });
  }
   return values;
 }
 
-// Make a vector out of `n_distinct` string values
-static std::vector<std::string> MakeStringDictFodder(int32_t n_values,
-                                                     int32_t n_distinct) {
+// Make a vector out of `kDistinctElements` string values
+static std::vector<std::string> MakeStringDictFodder() {
   std::default_random_engine gen(42);
-  std::vector<std::string> values_dict(n_distinct);
-  std::vector<std::string> values(n_values);
+  std::vector<std::string> values_dict(kDistinctElements);
+  std::vector<std::string> values(kNumberOfElements);
 
   {
     auto it = values_dict.begin();
@@ -305,7 +262,7 @@ static std::vector<std::string> MakeStringDictFodder(int32_t n_values,
     });
   }
   {
-    std::uniform_int_distribution<int32_t> indices_dist(0, n_distinct - 1);
+    std::uniform_int_distribution<int32_t> indices_dist(0, kDistinctElements - 1);
     std::generate(values.begin(), values.end(),
                   [&] { return values_dict[indices_dist(gen)]; });
   }
@@ -316,55 +273,64 @@ template <class DictionaryBuilderType, class Scalar>
 static void BenchmarkScalarDictionaryArray(
     benchmark::State& state,  // NOLINT non-const reference
     const std::vector<Scalar>& fodder) {
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     DictionaryBuilderType builder(default_memory_pool());
-    for (const auto value : fodder) {
-      ABORT_NOT_OK(builder.Append(value));
+
+    for (int64_t i = 0; i < kRounds; i++) {
+      for (const auto value : fodder) {
+        ABORT_NOT_OK(builder.Append(value));
+      }
     }
+
     std::shared_ptr<Array> out;
     ABORT_NOT_OK(builder.Finish(&out));
   }
-  state.SetBytesProcessed(state.iterations() * fodder.size() * sizeof(Scalar));
+
+  state.SetBytesProcessed(state.iterations() * kBytesProcessed);
 }
 
-static void BM_BuildInt64DictionaryArrayRandom(
+static void BuildInt64DictionaryArrayRandom(
     benchmark::State& state) {  // NOLINT non-const reference
-  const auto fodder = MakeRandomIntDictFodder(10000, 100);
+  const auto fodder = MakeRandomIntDictFodder();
   BenchmarkScalarDictionaryArray<DictionaryBuilder<Int64Type>>(state, fodder);
 }
 
-static void BM_BuildInt64DictionaryArraySequential(
+static void BuildInt64DictionaryArraySequential(
     benchmark::State& state) {  // NOLINT non-const reference
-  const auto fodder = MakeSequentialIntDictFodder(10000, 100);
+  const auto fodder = MakeSequentialIntDictFodder();
   BenchmarkScalarDictionaryArray<DictionaryBuilder<Int64Type>>(state, fodder);
 }
 
-static void BM_BuildInt64DictionaryArraySimilar(
+static void BuildInt64DictionaryArraySimilar(
     benchmark::State& state) {  // NOLINT non-const reference
-  const auto fodder = MakeSimilarIntDictFodder(10000, 100);
+  const auto fodder = MakeSimilarIntDictFodder();
   BenchmarkScalarDictionaryArray<DictionaryBuilder<Int64Type>>(state, fodder);
 }
 
-static void BM_BuildStringDictionaryArray(
+static void BuildStringDictionaryArray(
    benchmark::State& state) {  // NOLINT non-const reference
-  const auto fodder = MakeStringDictFodder(10000, 100);
-  auto type = binary();
+  const auto fodder = MakeStringDictFodder();
   auto fodder_size =
-      std::accumulate(fodder.begin(), fodder.end(), static_cast<size_t>(0),
+      std::accumulate(fodder.begin(), fodder.end(), 0UL,
                       [&](size_t acc, const std::string& s) { return acc + s.size(); });
 
-  while (state.KeepRunning()) {
+  for (auto _ : state) {
     BinaryDictionaryBuilder builder(default_memory_pool());
-    for (const auto& value : fodder) {
-      ABORT_NOT_OK(builder.Append(value));
+
+    for (int64_t i = 0; i < kRounds; i++) {
+      for (const auto& value : fodder) {
+        ABORT_NOT_OK(builder.Append(value));
+      }
     }
+
     std::shared_ptr<Array> out;
     ABORT_NOT_OK(builder.Finish(&out));
   }
-  state.SetBytesProcessed(state.iterations() * fodder_size);
+
+  state.SetBytesProcessed(state.iterations() * fodder_size * kRounds);
 }
 
-static void BM_ArrayDataConstructDestruct(
+static void ArrayDataConstructDestruct(
     benchmark::State& state) {  // NOLINT non-const reference
   std::vector<std::shared_ptr<ArrayData>> arrays;
 
@@ -383,51 +349,44 @@ static void ArrayDataConstructDestruct(
 
 // ----------------------------------------------------------------------
 // Benchmark declarations
+//
+
+#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
+
+// This benchmark acts as a reference to the native std::vector
+// implementation. It appends kRounds chunks into a vector.
+static void ReferenceBuildVectorNoNulls(
+    benchmark::State& state) {  // NOLINT non-const reference
+  for (auto _ : state) {
+    std::vector<ValueType> builder;
+
+    for (int i = 0; i < kRounds; i++) {
+      builder.insert(builder.end(), kData.cbegin(), kData.cend());
+    }
+  }
+
+  state.SetBytesProcessed(state.iterations() * kBytesProcessed);
+}
+
+BENCHMARK(ReferenceBuildVectorNoNulls);
+
+#endif
+
+BENCHMARK(BuildBooleanArrayNoNulls);
+
+BENCHMARK(BuildIntArrayNoNulls);
+BENCHMARK(BuildAdaptiveIntNoNulls);
+BENCHMARK(BuildAdaptiveIntNoNullsScalarAppend);
+
+BENCHMARK(BuildBinaryArray);
+BENCHMARK(BuildChunkedBinaryArray);
+BENCHMARK(BuildFixedSizeBinaryArray);
+
+BENCHMARK(BuildInt64DictionaryArrayRandom);
+BENCHMARK(BuildInt64DictionaryArraySequential);
+BENCHMARK(BuildInt64DictionaryArraySimilar);
+BENCHMARK(BuildStringDictionaryArray);
 
-static constexpr int32_t kRepetitions = 2;
-
-BENCHMARK(BM_ArrayDataConstructDestruct);
-
-BENCHMARK(BM_BuildPrimitiveArrayNoNulls)
-    ->Repetitions(kRepetitions)
-    ->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildVectorNoNulls)
-    ->Repetitions(kRepetitions)
-    ->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_BuildBooleanArrayNoNulls)
-    ->Repetitions(kRepetitions)
-    ->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_BuildAdaptiveIntNoNulls)
-    ->Repetitions(kRepetitions)
-    ->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildAdaptiveIntNoNullsScalarAppend)
-    ->Repetitions(3)
-    ->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildAdaptiveUIntNoNulls)
-    ->Repetitions(kRepetitions)
-    ->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildAdaptiveUIntNoNullsScalarAppend)
-    ->Repetitions(kRepetitions)
-    ->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_BuildBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildChunkedBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildFixedSizeBinaryArray)->MinTime(3.0)->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_BuildInt64DictionaryArrayRandom)
-    ->Repetitions(kRepetitions)
-    ->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildInt64DictionaryArraySequential)
-    ->Repetitions(kRepetitions)
-    ->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildInt64DictionaryArraySimilar)
-    ->Repetitions(kRepetitions)
-    ->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_BuildStringDictionaryArray)
-    ->Repetitions(kRepetitions)
-    ->Unit(benchmark::kMicrosecond);
+BENCHMARK(ArrayDataConstructDestruct);
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/column-benchmark.cc b/cpp/src/arrow/column-benchmark.cc
index 3ae83f6a210..bb2c63179ab 100644
--- a/cpp/src/arrow/column-benchmark.cc
+++ b/cpp/src/arrow/column-benchmark.cc
@@ -36,7 +36,7 @@ Status MakePrimitive(int64_t length, int64_t null_count, std::shared_ptr<Array>*
   }
 }  // anonymous namespace
 
-static void BM_BuildInt32ColumnByChunk(
+static void BuildInt32ColumnByChunk(
     benchmark::State& state) {  // NOLINT non-const reference
   ArrayVector arrays;
   for (int chunk_n = 0; chunk_n < state.range(0); ++chunk_n) {
@@ -52,6 +52,6 @@ static void BM_BuildInt32ColumnByChunk(
   }
 }
 
-BENCHMARK(BM_BuildInt32ColumnByChunk)->Range(5, 50000);
+BENCHMARK(BuildInt32ColumnByChunk)->Range(5, 50000);
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/benchmark-util.h b/cpp/src/arrow/compute/benchmark-util.h
index 865da6671e3..ee9cb9504a3 100644
--- a/cpp/src/arrow/compute/benchmark-util.h
+++ b/cpp/src/arrow/compute/benchmark-util.h
@@ -32,40 +32,42 @@ static const int64_t kL1Size = cpu_info->CacheSize(CpuInfo::L1_CACHE);
 static const int64_t kL2Size = cpu_info->CacheSize(CpuInfo::L2_CACHE);
 static const int64_t kL3Size = cpu_info->CacheSize(CpuInfo::L3_CACHE);
 static const int64_t kCantFitInL3Size = kL3Size * 4;
+static const std::vector<int64_t> kMemorySizes = {kL1Size, kL2Size, kL3Size,
+                                                  kCantFitInL3Size};
 
 template <typename Func>
 struct BenchmarkArgsType;
 
+// Pattern matching that extracts the vector element type of Benchmark::Args()
 template <typename Values>
 struct BenchmarkArgsType<benchmark::internal::Benchmark* (
     benchmark::internal::Benchmark::*)(const std::vector<Values>&)> {
   using type = Values;
 };
 
-void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) {
-  // Benchmark changed its parameter type between releases from
-  // int to int64_t. As it doesn't have version macros, we need
-  // to apply C++ template magic.
-  using ArgsType =
-      typename BenchmarkArgsType<decltype(&benchmark::internal::Benchmark::Args)>::type;
+// Benchmark changed its parameter type between releases from
+// int to int64_t. As it doesn't have version macros, we need
+// to apply C++ template magic.
+using ArgsType =
+    typename BenchmarkArgsType<decltype(&benchmark::internal::Benchmark::Args)>::type;
+
+void BenchmarkSetArgsWithSizes(benchmark::internal::Benchmark* bench,
+                               const std::vector<int64_t>& sizes = kMemorySizes) {
   bench->Unit(benchmark::kMicrosecond);
 
-  for (auto size : {kL1Size, kL2Size, kL3Size, kCantFitInL3Size})
+  for (auto size : sizes)
     for (auto nulls : std::vector<ArgsType>({0, 1, 10, 50}))
       bench->Args({static_cast<ArgsType>(size), nulls});
 }
 
-void RegressionSetArgs(benchmark::internal::Benchmark* bench) {
-  // Benchmark changed its parameter type between releases from
-  // int to int64_t. As it doesn't have version macros, we need
-  // to apply C++ template magic.
-  using ArgsType =
-      typename BenchmarkArgsType<decltype(&benchmark::internal::Benchmark::Args)>::type;
-  bench->Unit(benchmark::kMicrosecond);
+void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) {
+  BenchmarkSetArgsWithSizes(bench, kMemorySizes);
+}
 
-  // Regressions should only bench L1 data for better stability
-  for (auto nulls : std::vector<ArgsType>({0, 1, 10, 50}))
-    bench->Args({static_cast<ArgsType>(kL1Size), nulls});
+void RegressionSetArgs(benchmark::internal::Benchmark* bench) {
+  // Regressions do not need to account for the cache hierarchy, thus optimize
+  // for the best case.
+  BenchmarkSetArgsWithSizes(bench, {kL1Size});
 }
 
 }  // namespace compute
diff --git a/cpp/src/arrow/compute/compute-benchmark.cc b/cpp/src/arrow/compute/compute-benchmark.cc
index c14f706c445..449504121f5 100644
--- a/cpp/src/arrow/compute/compute-benchmark.cc
+++ b/cpp/src/arrow/compute/compute-benchmark.cc
@@ -31,7 +31,7 @@ namespace arrow {
 
 namespace compute {
 
-static void BM_BuildDictionary(benchmark::State& state) {  // NOLINT non-const reference
+static void BuildDictionary(benchmark::State& state) {  // NOLINT non-const reference
   const int64_t iterations = 1024;
 
   std::vector<int64_t> values;
@@ -55,7 +55,7 @@ static void BM_BuildDictionary(benchmark::State& state) {  // NOLINT non-const r
   state.SetBytesProcessed(state.iterations() * values.size() * sizeof(int64_t));
 }
 
-static void BM_BuildStringDictionary(
+static void BuildStringDictionary(
     benchmark::State& state) {  // NOLINT non-const reference
   const int64_t iterations = 1024 * 64;
   // Pre-render strings
@@ -172,62 +172,52 @@ void BenchDictionaryEncode(benchmark::State& state, const ParamType& params,
   state.SetBytesProcessed(state.iterations() * params.GetBytesProcessed(length));
 }
 
-static void BM_UniqueUInt8NoNulls(benchmark::State& state) {
+static void UniqueUInt8NoNulls(benchmark::State& state) {
   BenchUnique(state, HashParams<UInt8Type>{0}, state.range(0), state.range(1));
 }
 
-static void BM_UniqueUInt8WithNulls(benchmark::State& state) {
+static void UniqueUInt8WithNulls(benchmark::State& state) {
   BenchUnique(state, HashParams<UInt8Type>{0.05}, state.range(0), state.range(1));
 }
 
-static void BM_UniqueInt64NoNulls(benchmark::State& state) {
+static void UniqueInt64NoNulls(benchmark::State& state) {
   BenchUnique(state, HashParams<Int64Type>{0}, state.range(0), state.range(1));
 }
 
-static void BM_UniqueInt64WithNulls(benchmark::State& state) {
+static void UniqueInt64WithNulls(benchmark::State& state) {
   BenchUnique(state, HashParams<Int64Type>{0.05}, state.range(0), state.range(1));
 }
 
-static void BM_UniqueString10bytes(benchmark::State& state) {
+static void UniqueString10bytes(benchmark::State& state) {
   // Byte strings with 10 bytes each
   BenchUnique(state, HashParams<StringType>{0.05, 10}, state.range(0), state.range(1));
 }
 
-static void BM_UniqueString100bytes(benchmark::State& state) {
+static void UniqueString100bytes(benchmark::State& state) {
   // Byte strings with 100 bytes each
   BenchUnique(state, HashParams<StringType>{0.05, 100}, state.range(0), state.range(1));
 }
 
-BENCHMARK(BM_BuildDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildStringDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(BuildDictionary);
+BENCHMARK(BuildStringDictionary);
 
-constexpr int kHashBenchmarkLength = 1 << 24;
+constexpr int kHashBenchmarkLength = 1 << 22;
 
-#define ADD_HASH_ARGS(WHAT)                        \
-  WHAT->Args({kHashBenchmarkLength, 50})           \
-      ->Args({kHashBenchmarkLength, 1 << 10})      \
-      ->Args({kHashBenchmarkLength, 10 * 1 << 10}) \
-      ->Args({kHashBenchmarkLength, 1 << 20})      \
-      ->MinTime(1.0)                               \
-      ->Unit(benchmark::kMicrosecond)              \
-      ->UseRealTime()
+#define ADD_HASH_ARGS(WHAT) \
+  WHAT->Args({kHashBenchmarkLength, 1 << 10})->Args({kHashBenchmarkLength, 10 * 1 << 10})
 
-ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64NoNulls));
-ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64WithNulls));
-ADD_HASH_ARGS(BENCHMARK(BM_UniqueString10bytes));
-ADD_HASH_ARGS(BENCHMARK(BM_UniqueString100bytes));
+ADD_HASH_ARGS(BENCHMARK(UniqueInt64NoNulls));
+ADD_HASH_ARGS(BENCHMARK(UniqueInt64WithNulls));
+ADD_HASH_ARGS(BENCHMARK(UniqueString10bytes));
+ADD_HASH_ARGS(BENCHMARK(UniqueString100bytes));
 
-BENCHMARK(BM_UniqueUInt8NoNulls)
+BENCHMARK(UniqueUInt8NoNulls)
     ->Args({kHashBenchmarkLength, 200})
-    ->MinTime(1.0)
-    ->Unit(benchmark::kMicrosecond)
-    ->UseRealTime();
+    ->Unit(benchmark::kMicrosecond);
 
-BENCHMARK(BM_UniqueUInt8WithNulls)
+BENCHMARK(UniqueUInt8WithNulls)
     ->Args({kHashBenchmarkLength, 200})
-    ->MinTime(1.0)
-    ->Unit(benchmark::kMicrosecond)
-    ->UseRealTime();
+    ->Unit(benchmark::kMicrosecond);
 
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
index 085843e700d..cc2d3aa58dd 100644
--- a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
@@ -37,6 +37,8 @@ namespace compute {
 #include <cassert>
 #include <vector>
 
+#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
+
 namespace BitUtil = arrow::BitUtil;
 
 using arrow::internal::BitmapReader;
@@ -273,7 +275,7 @@ struct SumBitmapVectorizeUnroll : public Summer<T> {
 };
 
 template <typename Functor>
-void BenchSum(benchmark::State& state) {
+void ReferenceSum(benchmark::State& state) {
   using T = typename Functor::ValueType;
 
   const int64_t array_size = state.range(0) / sizeof(int64_t);
@@ -295,15 +297,17 @@ void BenchSum(benchmark::State& state) {
   state.SetBytesProcessed(state.iterations() * array_size * sizeof(T));
 }
 
-BENCHMARK_TEMPLATE(BenchSum, SumNoNulls<int64_t>)->Apply(BenchmarkSetArgs);
-BENCHMARK_TEMPLATE(BenchSum, SumNoNullsUnrolled<int64_t>)->Apply(BenchmarkSetArgs);
-BENCHMARK_TEMPLATE(BenchSum, SumSentinel<int64_t>)->Apply(BenchmarkSetArgs);
-BENCHMARK_TEMPLATE(BenchSum, SumSentinelUnrolled<int64_t>)->Apply(BenchmarkSetArgs);
-BENCHMARK_TEMPLATE(BenchSum, SumBitmapNaive<int64_t>)->Apply(BenchmarkSetArgs);
-BENCHMARK_TEMPLATE(BenchSum, SumBitmapReader<int64_t>)->Apply(BenchmarkSetArgs);
-BENCHMARK_TEMPLATE(BenchSum, SumBitmapVectorizeUnroll<int64_t>)->Apply(BenchmarkSetArgs);
-
-static void RegressionSumKernel(benchmark::State& state) {
+BENCHMARK_TEMPLATE(ReferenceSum, SumNoNulls<int64_t>)->Apply(BenchmarkSetArgs);
+BENCHMARK_TEMPLATE(ReferenceSum, SumNoNullsUnrolled<int64_t>)->Apply(BenchmarkSetArgs);
+BENCHMARK_TEMPLATE(ReferenceSum, SumSentinel<int64_t>)->Apply(BenchmarkSetArgs);
+BENCHMARK_TEMPLATE(ReferenceSum, SumSentinelUnrolled<int64_t>)->Apply(BenchmarkSetArgs);
+BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapNaive<int64_t>)->Apply(BenchmarkSetArgs);
+BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapReader<int64_t>)->Apply(BenchmarkSetArgs);
+BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll<int64_t>)
+    ->Apply(BenchmarkSetArgs);
+
+#endif  // ARROW_WITH_BENCHMARKS_REFERENCE
+
+static void SumKernel(benchmark::State& state) {
   const int64_t array_size = state.range(0) / sizeof(int64_t);
   const double null_percent = static_cast<double>(state.range(1)) / 100.0;
   auto rand = random::RandomArrayGenerator(1923);
@@ -322,7 +326,7 @@ static void RegressionSumKernel(benchmark::State& state) {
   state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t));
 }
 
-BENCHMARK(RegressionSumKernel)->Apply(RegressionSetArgs);
+BENCHMARK(SumKernel)->Apply(RegressionSetArgs);
 
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/filter-benchmark.cc b/cpp/src/arrow/compute/kernels/filter-benchmark.cc
index 3826e261a7a..24e18415ac8 100644
--- a/cpp/src/arrow/compute/kernels/filter-benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/filter-benchmark.cc
@@ -29,7 +29,7 @@ namespace arrow {
 
 namespace compute {
 
-static void BenchCompareKernel(benchmark::State& state) {
+static void CompareArrayScalarKernel(benchmark::State& state) {
   const int64_t memory_size = state.range(0) / 4;
   const int64_t array_size = memory_size / sizeof(int64_t);
   const double null_percent = static_cast<double>(state.range(1)) / 100.0;
@@ -37,7 +37,7 @@ static void BenchCompareKernel(benchmark::State& state) {
   auto array = std::static_pointer_cast<NumericArray<Int64Type>>(
       rand.Int64(array_size, -100, 100, null_percent));
 
-  CompareOptions ge(GREATER_EQUAL);
+  CompareOptions ge{GREATER_EQUAL};
 
   FunctionContext ctx;
   for (auto _ : state) {
@@ -51,7 +51,7 @@ static void BenchCompareKernel(benchmark::State& state) {
   state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t));
 }
 
-BENCHMARK(BenchCompareKernel)->Apply(BenchmarkSetArgs);
+BENCHMARK(CompareArrayScalarKernel)->Apply(RegressionSetArgs);
 
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/csv/converter-benchmark.cc b/cpp/src/arrow/csv/converter-benchmark.cc
index c43fce66b28..e128e7b3f8e 100644
--- a/cpp/src/arrow/csv/converter-benchmark.cc
+++ b/cpp/src/arrow/csv/converter-benchmark.cc
@@ -74,24 +74,24 @@ static void BenchmarkConversion(benchmark::State& state,  // NOLINT non-const re
   state.SetItemsProcessed(state.iterations() * parser.num_rows());
 }
 
-static void BM_Int64Conversion(benchmark::State& state) {  // NOLINT non-const reference
-  const int32_t num_rows = 10000;
+constexpr size_t num_rows = 10000;
+
+static void Int64Conversion(benchmark::State& state) {  // NOLINT non-const reference
   auto parser = BuildInt64Data(num_rows);
   auto options = ConvertOptions::Defaults();
 
   BenchmarkConversion(state, *parser, int64(), options);
 }
 
-static void BM_FloatConversion(benchmark::State& state) {  // NOLINT non-const reference
-  const int32_t num_rows = 10000;
+static void FloatConversion(benchmark::State& state) {  // NOLINT non-const reference
   auto parser = BuildFloatData(num_rows);
   auto options = ConvertOptions::Defaults();
 
   BenchmarkConversion(state, *parser, float64(), options);
 }
 
-BENCHMARK(BM_Int64Conversion)->Repetitions(3);
-BENCHMARK(BM_FloatConversion)->Repetitions(3);
+BENCHMARK(Int64Conversion);
+BENCHMARK(FloatConversion);
 
 }  // namespace csv
 }  // namespace arrow
diff --git a/cpp/src/arrow/csv/parser-benchmark.cc b/cpp/src/arrow/csv/parser-benchmark.cc
index 8dcb06bd3cc..c474af5c97a 100644
--- a/cpp/src/arrow/csv/parser-benchmark.cc
+++ b/cpp/src/arrow/csv/parser-benchmark.cc
@@ -28,20 +28,17 @@ namespace arrow {
 
 namespace csv {
 
-static std::string BuildQuotedData(int32_t num_rows = 10000) {
-  std::string one_row = "abc,\"d,f\",12.34,\n";
-  std::stringstream ss;
-  for (int32_t i = 0; i < num_rows; ++i) {
-    ss << one_row;
-  }
-  return ss.str();
-}
+// Linter stipulates:
+// >> For a static/global string constant, use a C style string instead
+const char* one_row = "abc,\"d,f\",12.34,\n";
+const char* one_row_escaped = "abc,d\\,f,12.34,\n";
 
-static std::string BuildEscapedData(int32_t num_rows = 10000) {
-  std::string one_row = "abc,d\\,f,12.34,\n";
+size_t num_rows = (1024 * 64) / strlen(one_row);
+
+static std::string BuildCSVData(const std::string& row, size_t repeat) {
   std::stringstream ss;
-  for (int32_t i = 0; i < num_rows; ++i) {
-    ss << one_row;
+  for (size_t i = 0; i < repeat; ++i) {
+    ss << row;
   }
   return ss.str();
 }
@@ -49,23 +46,19 @@ static void BenchmarkCSVChunking(benchmark::State& state,  // NOLINT non-const reference
                                  const std::string& csv, ParseOptions options) {
   Chunker chunker(options);
+  const uint32_t csv_size = static_cast<uint32_t>(csv.size());
 
   while (state.KeepRunning()) {
-    uint32_t chunk_size;
-    ABORT_NOT_OK(
-        chunker.Process(csv.data(), static_cast<uint32_t>(csv.size()), &chunk_size));
-    if (chunk_size != csv.size()) {
-      std::cerr << "Parsing incomplete\n";
-      std::abort();
-    }
+    uint32_t chunk_size = 0;
+    ABORT_NOT_OK(chunker.Process(csv.data(), csv_size, &chunk_size));
+    benchmark::DoNotOptimize(chunk_size);
   }
-  state.SetBytesProcessed(state.iterations() * csv.size());
+
+  state.SetBytesProcessed(state.iterations() * csv_size);
 }
 
-static void BM_ChunkCSVQuotedBlock(
-    benchmark::State& state) {  // NOLINT non-const reference
-  const int32_t num_rows = 5000;
-  auto csv = BuildQuotedData(num_rows);
+static void ChunkCSVQuotedBlock(benchmark::State& state) {  // NOLINT non-const reference
+  auto csv = BuildCSVData(one_row, num_rows);
   auto options = ParseOptions::Defaults();
   options.quoting = true;
   options.escaping = false;
@@ -74,10 +67,8 @@ static void BM_ChunkCSVQuotedBlock(
   BenchmarkCSVChunking(state, csv, options);
 }
 
-static void BM_ChunkCSVEscapedBlock(
-    benchmark::State& state) {  // NOLINT non-const reference
-  const int32_t num_rows = 5000;
-  auto csv = BuildEscapedData(num_rows);
+static void ChunkCSVEscapedBlock(benchmark::State& state) {  // NOLINT non-const reference
+  auto csv = BuildCSVData(one_row_escaped, num_rows);
   auto options = ParseOptions::Defaults();
   options.quoting = false;
   options.escaping = true;
@@ -86,31 +77,30 @@ static void BM_ChunkCSVEscapedBlock(
   BenchmarkCSVChunking(state, csv, options);
 }
 
-static void BM_ChunkCSVNoNewlinesBlock(
+static void ChunkCSVNoNewlinesBlock(
    benchmark::State& state) {  // NOLINT non-const reference
-  const int32_t num_rows = 5000;
-  auto csv = BuildEscapedData(num_rows);
+  auto csv = BuildCSVData(one_row_escaped, num_rows);
   auto options = ParseOptions::Defaults();
   options.quoting = true;
   options.escaping = false;
   options.newlines_in_values = false;
   BenchmarkCSVChunking(state, csv, options);
+  // Reporting timings rather than bogus bandwidth provides better regression
+  // stability.
+  state.SetBytesProcessed(0);
 }
 
 static void BenchmarkCSVParsing(benchmark::State& state,  // NOLINT non-const reference
-                                const std::string& csv, int32_t num_rows,
+                                const std::string& csv, int32_t rows,
                                 ParseOptions options) {
-  BlockParser parser(options, -1, num_rows + 1);
+  BlockParser parser(options, -1, rows + 1);
+  const uint32_t csv_size = static_cast<uint32_t>(csv.size());
 
   while (state.KeepRunning()) {
-    uint32_t parsed_size;
-    ABORT_NOT_OK(
-        parser.Parse(csv.data(), static_cast<uint32_t>(csv.size()), &parsed_size));
-    if (parsed_size != csv.size() || parser.num_rows() != num_rows) {
-      std::cerr << "Parsing incomplete\n";
-      std::abort();
-    }
+    uint32_t parsed_size = 0;
+    ABORT_NOT_OK(parser.Parse(csv.data(), csv_size, &parsed_size));
+
     // Include performance of visiting the parsed values, as that might
     // vary depending on the parser's internal data structures.
     bool dummy_quoted = false;
@@ -126,13 +116,12 @@ static void BenchmarkCSVParsing(benchmark::State& state,  // NOLINT non-const re
       benchmark::DoNotOptimize(dummy_quoted);
     }
   }
-  state.SetBytesProcessed(state.iterations() * csv.size());
+
+  state.SetBytesProcessed(state.iterations() * csv_size);
 }
 
-static void BM_ParseCSVQuotedBlock(
-    benchmark::State& state) {  // NOLINT non-const reference
-  const int32_t num_rows = 5000;
-  auto csv = BuildQuotedData(num_rows);
+static void ParseCSVQuotedBlock(benchmark::State& state) {  // NOLINT non-const reference
+  auto csv = BuildCSVData(one_row, num_rows);
   auto options = ParseOptions::Defaults();
   options.quoting = true;
   options.escaping = false;
@@ -140,10 +129,8 @@ static void BM_ParseCSVQuotedBlock(
   BenchmarkCSVParsing(state, csv, num_rows, options);
 }
 
-static void BM_ParseCSVEscapedBlock(
-    benchmark::State& state) {  // NOLINT non-const reference
-  const int32_t num_rows = 5000;
-  auto csv = BuildEscapedData(num_rows);
+static void ParseCSVEscapedBlock(benchmark::State& state) {  // NOLINT non-const reference
+  auto csv = BuildCSVData(one_row_escaped, num_rows);
   auto options = ParseOptions::Defaults();
   options.quoting = false;
   options.escaping = true;
@@ -151,11 +138,11 @@ static void BM_ParseCSVEscapedBlock(
   BenchmarkCSVParsing(state, csv, num_rows, options);
 }
 
-BENCHMARK(BM_ChunkCSVQuotedBlock)->Repetitions(3)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_ChunkCSVEscapedBlock)->Repetitions(3)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_ChunkCSVNoNewlinesBlock)->Repetitions(3)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_ParseCSVQuotedBlock)->Repetitions(3)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_ParseCSVEscapedBlock)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+BENCHMARK(ChunkCSVQuotedBlock);
+BENCHMARK(ChunkCSVEscapedBlock);
+BENCHMARK(ChunkCSVNoNewlinesBlock);
+BENCHMARK(ParseCSVQuotedBlock);
+BENCHMARK(ParseCSVEscapedBlock);
 
 }  // namespace csv
 }  // namespace arrow
diff --git a/cpp/src/arrow/gpu/cuda-benchmark.cc b/cpp/src/arrow/gpu/cuda-benchmark.cc
index a61eb921e91..267d64a1776 100644
--- a/cpp/src/arrow/gpu/cuda-benchmark.cc
+++ b/cpp/src/arrow/gpu/cuda-benchmark.cc
@@ -65,7 +65,7 @@ static void CudaBufferWriterBenchmark(benchmark::State& state, const int64_t tot
   state.SetBytesProcessed(int64_t(state.iterations()) * total_bytes);
 }
 
-static void BM_Writer_Buffered(benchmark::State& state) {
+static void Writer_Buffered(benchmark::State& state) {
   // 128MB
   const int64_t kTotalBytes = 1 << 27;
 
@@ -75,24 +75,19 @@ static void BM_Writer_Buffered(benchmark::State& state) {
   CudaBufferWriterBenchmark(state, kTotalBytes, state.range(0), kBufferSize);
 }
 
-static void BM_Writer_Unbuffered(benchmark::State& state) {
+static void Writer_Unbuffered(benchmark::State& state) {
   // 128MB
   const int64_t kTotalBytes = 1 << 27;
   CudaBufferWriterBenchmark(state, kTotalBytes, state.range(0), 0);
 }
 
 // Vary chunk write size from 256 bytes to 64K
-BENCHMARK(BM_Writer_Buffered)
-    ->RangeMultiplier(16)
-    ->Range(1 << 8, 1 << 16)
-    ->MinTime(1.0)
-    ->UseRealTime();
+BENCHMARK(Writer_Buffered)->RangeMultiplier(16)->Range(1 << 8, 1 << 16)->UseRealTime();
 
-BENCHMARK(BM_Writer_Unbuffered)
+BENCHMARK(Writer_Unbuffered)
     ->RangeMultiplier(4)
     ->RangeMultiplier(16)
     ->Range(1 << 8, 1 << 16)
-    ->MinTime(1.0)
     ->UseRealTime();
 
 }  // namespace cuda
diff --git a/cpp/src/arrow/io/file-benchmark.cc b/cpp/src/arrow/io/file-benchmark.cc
index b4344238a1f..74b92cbf3d6 100644
--- a/cpp/src/arrow/io/file-benchmark.cc
+++ b/cpp/src/arrow/io/file-benchmark.cc
@@ -155,7 +155,7 @@ static void BenchmarkStreamingWrites(benchmark::State& state,
 // This situation is unrealistic as the kernel likely doesn't
 // copy the data at all, so we only measure small writes.
 
-static void BM_FileOutputStreamSmallWritesToNull(
+static void FileOutputStreamSmallWritesToNull(
     benchmark::State& state) {  // NOLINT non-const reference
   std::shared_ptr<io::OutputStream> stream;
   ABORT_NOT_OK(io::FileOutputStream::Open(GetNullFile(), &stream));
@@ -163,7 +163,7 @@ static void BM_FileOutputStreamSmallWritesToNull(
   BenchmarkStreamingWrites(state, small_sizes, stream.get());
 }
 
-static void BM_BufferedOutputStreamSmallWritesToNull(
+static void BufferedOutputStreamSmallWritesToNull(
    benchmark::State& state) {  // NOLINT non-const reference
   std::shared_ptr<io::OutputStream> file;
   ABORT_NOT_OK(io::FileOutputStream::Open(GetNullFile(), &file));
@@ -178,7 +178,7 @@ static void BM_BufferedOutputStreamSmallWritesToNull(
 //
 // This is slightly more realistic than the above
 
-static void BM_FileOutputStreamSmallWritesToPipe(
+static void FileOutputStreamSmallWritesToPipe(
     benchmark::State& state) {  // NOLINT non-const reference
   std::shared_ptr<io::OutputStream> stream;
   std::shared_ptr<BackgroundReader> reader;
@@ -187,7 +187,7 @@ static void BM_FileOutputStreamSmallWritesToPipe(
   BenchmarkStreamingWrites(state, small_sizes, stream.get(), reader.get());
 }
 
-static void BM_FileOutputStreamLargeWritesToPipe(
+static void FileOutputStreamLargeWritesToPipe(
     benchmark::State& state) {  // NOLINT non-const reference
   std::shared_ptr<io::OutputStream> stream;
   std::shared_ptr<BackgroundReader> reader;
@@ -196,7 +196,7 @@ static void BM_FileOutputStreamLargeWritesToPipe(
   BenchmarkStreamingWrites(state, large_sizes, stream.get(), reader.get());
 }
 
-static void BM_BufferedOutputStreamSmallWritesToPipe(
+static void BufferedOutputStreamSmallWritesToPipe(
    benchmark::State& state) {  // NOLINT non-const reference
   std::shared_ptr<io::OutputStream> stream;
   std::shared_ptr<BackgroundReader> reader;
@@ -208,7 +208,7 @@ static void BM_BufferedOutputStreamSmallWritesToPipe(
   BenchmarkStreamingWrites(state, small_sizes, buffered_stream.get(), reader.get());
 }
 
-static void BM_BufferedOutputStreamLargeWritesToPipe(
+static void BufferedOutputStreamLargeWritesToPipe(
     benchmark::State& state) {  // NOLINT non-const reference
   std::shared_ptr<io::OutputStream> stream;
   std::shared_ptr<BackgroundReader> reader;
@@ -224,31 +224,13 @@ static void BM_BufferedOutputStreamLargeWritesToPipe(
 
 // We use real time as we don't want to count CPU time spent in the
 // BackgroundReader thread
 
-BENCHMARK(BM_FileOutputStreamSmallWritesToNull)
-    ->Repetitions(2)
-    ->MinTime(1.0)
-    ->UseRealTime();
-BENCHMARK(BM_FileOutputStreamSmallWritesToPipe)
-    ->Repetitions(2)
-    ->MinTime(1.0)
-    ->UseRealTime();
-BENCHMARK(BM_FileOutputStreamLargeWritesToPipe)
-    ->Repetitions(2)
-    ->MinTime(1.0)
-    ->UseRealTime();
-
-BENCHMARK(BM_BufferedOutputStreamSmallWritesToNull)
-    ->Repetitions(2)
-    ->MinTime(1.0)
-    ->UseRealTime();
-BENCHMARK(BM_BufferedOutputStreamSmallWritesToPipe)
-    ->Repetitions(2)
-    ->MinTime(1.0)
-    ->UseRealTime();
-BENCHMARK(BM_BufferedOutputStreamLargeWritesToPipe)
-    ->Repetitions(2)
-    ->MinTime(1.0)
-    ->UseRealTime();
+BENCHMARK(FileOutputStreamSmallWritesToNull)->UseRealTime();
+BENCHMARK(FileOutputStreamSmallWritesToPipe)->UseRealTime();
+BENCHMARK(FileOutputStreamLargeWritesToPipe)->UseRealTime();
+
+BENCHMARK(BufferedOutputStreamSmallWritesToNull)->UseRealTime();
+BENCHMARK(BufferedOutputStreamSmallWritesToPipe)->UseRealTime();
+BENCHMARK(BufferedOutputStreamLargeWritesToPipe)->UseRealTime();
 
 #endif  // ifndef _WIN32
diff --git a/cpp/src/arrow/io/memory-benchmark.cc b/cpp/src/arrow/io/memory-benchmark.cc
index 78389574b63..a3676e41f2f 100644
--- a/cpp/src/arrow/io/memory-benchmark.cc
+++ b/cpp/src/arrow/io/memory-benchmark.cc
@@ -44,6 +44,7 @@ static const int64_t kL3Size = cpu_info->CacheSize(CpuInfo::L3_CACHE);
 constexpr size_t kMemoryPerCore = 32 * 1024 * 1024;
 using BufferPtr = std::shared_ptr<Buffer>;
 
+#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
 #ifndef _MSC_VER
 
 #ifdef ARROW_AVX512
@@ -200,6 +201,7 @@ BENCHMARK_TEMPLATE(MemoryBandwidth, StreamReadWrite)->Apply(SetMemoryBandwidthAr
 BENCHMARK_TEMPLATE(MemoryBandwidth, PlatformMemcpy)->Apply(SetMemoryBandwidthArgs);
 
 #endif  // _MSC_VER
+#endif  // ARROW_WITH_BENCHMARKS_REFERENCE
 
 static void ParallelMemoryCopy(benchmark::State& state) {  // NOLINT non-const reference
   const int64_t n_threads = state.range(0);
diff --git a/cpp/src/arrow/ipc/read-write-benchmark.cc b/cpp/src/arrow/ipc/read-write-benchmark.cc
index 66d45fb0127..6f66f9c4926 100644
--- a/cpp/src/arrow/ipc/read-write-benchmark.cc
+++ b/cpp/src/arrow/ipc/read-write-benchmark.cc
@@ -47,7 +47,7 @@ std::shared_ptr<RecordBatch> MakeRecordBatch(int64_t total_size, int64_t num_fie
   return RecordBatch::Make(schema, length, arrays);
 }
 
-static void BM_WriteRecordBatch(benchmark::State& state) {  // NOLINT non-const reference
+static void WriteRecordBatch(benchmark::State& state) {  // NOLINT non-const reference
   // 1MB
   constexpr int64_t kTotalSize = 1 << 20;
 
@@ -68,7 +68,7 @@ static void BM_WriteRecordBatch(benchmark::State& state) {  // NOLINT non-const
   state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize);
 }
 
-static void BM_ReadRecordBatch(benchmark::State& state) {  // NOLINT non-const reference
+static void ReadRecordBatch(benchmark::State& state) {  // NOLINT non-const reference
   // 1MB
   constexpr int64_t kTotalSize = 1 << 20;
 
@@ -99,16 +99,7 @@ static void BM_ReadRecordBatch(benchmark::State& state) {  // NOLINT non-const r
   state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize);
 }
 
-BENCHMARK(BM_WriteRecordBatch)
-    ->RangeMultiplier(4)
-    ->Range(1, 1 << 13)
-    ->MinTime(1.0)
-    ->UseRealTime();
-
-BENCHMARK(BM_ReadRecordBatch)
-    ->RangeMultiplier(4)
-    ->Range(1, 1 << 13)
-    ->MinTime(1.0)
-    ->UseRealTime();
+BENCHMARK(WriteRecordBatch)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime();
+BENCHMARK(ReadRecordBatch)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime();
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/json/parser-benchmark.cc b/cpp/src/arrow/json/parser-benchmark.cc
index b186f069612..66ef9ece425 100644
--- a/cpp/src/arrow/json/parser-benchmark.cc
+++ b/cpp/src/arrow/json/parser-benchmark.cc
@@ -30,57 +30,64 @@ namespace arrow {
 
 namespace json {
 
-static void BenchmarkJSONChunking(benchmark::State& state,  // NOLINT non-const reference
+std::shared_ptr<Schema> TestSchema() {
+  return schema({field("int", int32()), field("str", utf8())});
+}
+
+constexpr int seed = 0x432432;
+
+std::string TestJsonData(int num_rows, bool pretty = false) {
+  std::default_random_engine engine(seed);
+  std::string json;
+  for (int i = 0; i < num_rows; ++i) {
+    StringBuffer sb;
+    Writer writer(sb);
+    ABORT_NOT_OK(Generate(TestSchema(), engine, &writer));
+    json += pretty ? PrettyPrint(sb.GetString()) : sb.GetString();
+    json += "\n";
+  }
+
+  return json;
+}
+
+static void BenchmarkJSONChunking(benchmark::State& state,
                                   const std::shared_ptr<Buffer>& json,
-                                  ParseOptions options) {
+                                  ParseOptions options) {  // NOLINT non-const reference
   auto chunker = Chunker::Make(options);
+
   for (auto _ : state) {
     std::shared_ptr<Buffer> chunked, partial;
     ABORT_NOT_OK(chunker->Process(json, &chunked, &partial));
   }
+
   state.SetBytesProcessed(state.iterations() * json->size());
 }
 
-static void BM_ChunkJSONPrettyPrinted(
+static void ChunkJSONPrettyPrinted(
     benchmark::State& state) {  // NOLINT non-const reference
   const int32_t num_rows = 5000;
+
   auto options = ParseOptions::Defaults();
   options.newlines_in_values = true;
-  options.explicit_schema = schema({field("int", int32()), field("str", utf8())});
-  std::default_random_engine engine;
-  std::string json;
-  for (int i = 0; i < num_rows; ++i) {
-    StringBuffer sb;
-    Writer writer(sb);
-    ABORT_NOT_OK(Generate(options.explicit_schema, engine, &writer));
-    json += PrettyPrint(sb.GetString());
-    json += "\n";
-  }
+  options.explicit_schema = TestSchema();
+
+  auto json = TestJsonData(num_rows, /* pretty */ true);
   BenchmarkJSONChunking(state, std::make_shared<Buffer>(json), options);
 }
 
-BENCHMARK(BM_ChunkJSONPrettyPrinted)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
-
-static void BM_ChunkJSONLineDelimited(
+static void ChunkJSONLineDelimited(
    benchmark::State& state) {  // NOLINT non-const reference
   const int32_t num_rows = 5000;
+
   auto options = ParseOptions::Defaults();
   options.newlines_in_values = false;
-  options.explicit_schema = schema({field("int", int32()), field("str", utf8())});
-  std::default_random_engine engine;
-  std::string json;
-  for (int i = 0; i < num_rows; ++i) {
-    StringBuffer sb;
-    Writer writer(sb);
-    ABORT_NOT_OK(Generate(options.explicit_schema, engine, &writer));
-    json += sb.GetString();
-    json += "\n";
-  }
+  options.explicit_schema = TestSchema();
+
+  auto json = TestJsonData(num_rows);
   BenchmarkJSONChunking(state, std::make_shared<Buffer>(json), options);
+  state.SetBytesProcessed(0);
 }
 
-BENCHMARK(BM_ChunkJSONLineDelimited)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
-
 static void BenchmarkJSONParsing(benchmark::State& state,  // NOLINT non-const reference
                                  const std::shared_ptr<Buffer>& json, int32_t num_rows,
                                  ParseOptions options) {
@@ -88,38 +95,24 @@ static void BenchmarkJSONParsing(benchmark::State& state,  // NOLINT non-const r
     std::unique_ptr<BlockParser> parser;
     ABORT_NOT_OK(BlockParser::Make(options, &parser));
     ABORT_NOT_OK(parser->Parse(json));
-    if (parser->num_rows() != num_rows) {
-      std::cerr << "Parsing incomplete\n";
-      std::abort();
-    }
+
     std::shared_ptr<Array> parsed;
     ABORT_NOT_OK(parser->Finish(&parsed));
   }
   state.SetBytesProcessed(state.iterations() * json->size());
 }
 
-static void BM_ParseJSONBlockWithSchema(
+static void ParseJSONBlockWithSchema(
    benchmark::State& state) {  // NOLINT non-const reference
   const int32_t num_rows = 5000;
   auto options = ParseOptions::Defaults();
   options.unexpected_field_behavior = UnexpectedFieldBehavior::Error;
-  options.explicit_schema = schema({field("int", int32()), field("str", utf8())});
-  std::default_random_engine engine;
-  std::string json;
-  for (int i = 0; i < num_rows; ++i) {
-    StringBuffer sb;
-    Writer writer(sb);
-    ABORT_NOT_OK(Generate(options.explicit_schema, engine, &writer));
-    json += sb.GetString();
-    json += "\n";
-  }
+  options.explicit_schema = TestSchema();
+
+  auto json = TestJsonData(num_rows);
   BenchmarkJSONParsing(state, std::make_shared<Buffer>(json), num_rows, options);
 }
-BENCHMARK(BM_ParseJSONBlockWithSchema)->MinTime(1.0)->Unit(benchmark::kMicrosecond); - -std::shared_ptr tables[2]; - static void BenchmarkJSONReading(benchmark::State& state, // NOLINT non-const reference const std::string& json, int32_t num_rows, ReadOptions read_options, ParseOptions parse_options) { @@ -133,49 +126,41 @@ static void BenchmarkJSONReading(benchmark::State& state, // NOLINT non-const r std::shared_ptr
table; ABORT_NOT_OK(reader->Read(&table)); - - if (table->num_rows() != num_rows) { - std::cerr << "Parsing incomplete\n"; - std::abort(); - } - - tables[read_options.use_threads] = table; } - state.SetBytesProcessed(state.iterations() * json.size()); - if (tables[false] && tables[true]) { - AssertTablesEqual(*tables[false], *tables[true]); - } + state.SetBytesProcessed(state.iterations() * json.size()); } -static void BM_ReadJSONBlockWithSchema( - benchmark::State& state) { // NOLINT non-const reference - const int32_t num_rows = 50000; +static void BenchmarkReadJSONBlockWithSchema( + benchmark::State& state, bool use_threads) { // NOLINT non-const reference + const int32_t num_rows = 500000; auto read_options = ReadOptions::Defaults(); - read_options.use_threads = state.range(0); + read_options.use_threads = use_threads; auto parse_options = ParseOptions::Defaults(); parse_options.unexpected_field_behavior = UnexpectedFieldBehavior::Error; - parse_options.explicit_schema = schema({field("int", int32()), field("str", utf8())}); + parse_options.explicit_schema = TestSchema(); - std::default_random_engine engine; - std::string json; - for (int i = 0; i < num_rows; ++i) { - StringBuffer sb; - Writer writer(sb); - ABORT_NOT_OK(Generate(parse_options.explicit_schema, engine, &writer)); - json += sb.GetString(); - json += "\n"; - } + auto json = TestJsonData(num_rows); BenchmarkJSONReading(state, json, num_rows, read_options, parse_options); } -BENCHMARK(BM_ReadJSONBlockWithSchema) - ->MinTime(1.0) - ->Unit(benchmark::kMicrosecond) - ->Arg(true) - ->Arg(false) - ->UseRealTime(); +static void ReadJSONBlockWithSchemaSingleThread( + benchmark::State& state) { // NOLINT non-const reference + BenchmarkReadJSONBlockWithSchema(state, false); +} + +static void ReadJSONBlockWithSchemaMultiThread( + benchmark::State& state) { // NOLINT non-const reference + BenchmarkReadJSONBlockWithSchema(state, true); +} + +BENCHMARK(ChunkJSONPrettyPrinted); +BENCHMARK(ChunkJSONLineDelimited); +BENCHMARK(ParseJSONBlockWithSchema); + +BENCHMARK(ReadJSONBlockWithSchemaSingleThread); +BENCHMARK(ReadJSONBlockWithSchemaMultiThread)->UseRealTime(); } // namespace json } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util-benchmark.cc b/cpp/src/arrow/util/bit-util-benchmark.cc index fbe786726fa..5131ceb88d1 100644 --- a/cpp/src/arrow/util/bit-util-benchmark.cc +++ b/cpp/src/arrow/util/bit-util-benchmark.cc @@ -28,10 +28,10 @@ namespace arrow { -using internal::CopyBitmap; - namespace BitUtil { +#ifdef ARROW_WITH_BENCHMARKS_REFERENCE + // A naive bitmap reader implementation, meant as a baseline against // internal::BitmapReader @@ -84,6 +84,8 @@ class NaiveBitmapWriter { int64_t position_; }; +#endif + static std::shared_ptr CreateRandomBuffer(int64_t nbytes) { std::shared_ptr buffer; ABORT_NOT_OK(AllocateBuffer(nbytes, &buffer)); @@ -119,20 +121,24 @@ static void BenchmarkBitmapReader(benchmark::State& state, int64_t nbytes) { benchmark::DoNotOptimize(total); } } - state.SetBytesProcessed(2 * int64_t(state.iterations()) * nbytes); + state.SetBytesProcessed(2LL * state.iterations() * nbytes); } +constexpr bool pattern[] = {false, false, false, true, true, true}; +static_assert( + (sizeof(pattern) / sizeof(pattern[0])) % 8 != 0, + "pattern must not be a multiple of 8, otherwise gcc can optimize with a memset"); + template static void BenchmarkBitmapWriter(benchmark::State& state, int64_t nbytes) { std::shared_ptr buffer = CreateRandomBuffer(nbytes); const int64_t num_bits = nbytes * 8; uint8_t* bitmap = 
buffer->mutable_data(); - const bool pattern[] = {false, false, false, true, true, true}; - while (state.KeepRunning()) { - int64_t pattern_index = 0; + for (auto _ : state) { BitmapWriterType writer(bitmap, 0, num_bits); + int64_t pattern_index = 0; for (int64_t i = 0; i < num_bits; i++) { if (pattern[pattern_index++]) { writer.Set(); @@ -147,7 +153,7 @@ static void BenchmarkBitmapWriter(benchmark::State& state, int64_t nbytes) { writer.Finish(); benchmark::ClobberMemory(); } - state.SetBytesProcessed(int64_t(state.iterations()) * nbytes); + state.SetBytesProcessed(state.iterations() * nbytes); } template @@ -156,8 +162,6 @@ static void BenchmarkGenerateBits(benchmark::State& state, int64_t nbytes) { const int64_t num_bits = nbytes * 8; uint8_t* bitmap = buffer->mutable_data(); - // pattern should be the same as in BenchmarkBitmapWriter - const bool pattern[] = {false, false, false, true, true, true}; while (state.KeepRunning()) { int64_t pattern_index = 0; @@ -171,26 +175,18 @@ static void BenchmarkGenerateBits(benchmark::State& state, int64_t nbytes) { GenerateBitsFunctorType()(bitmap, 0, num_bits, generate); benchmark::ClobberMemory(); } - state.SetBytesProcessed(2 * int64_t(state.iterations()) * nbytes); + state.SetBytesProcessed(state.iterations() * nbytes); } -static void BM_NaiveBitmapReader(benchmark::State& state) { - BenchmarkBitmapReader(state, state.range(0)); -} - -static void BM_BitmapReader(benchmark::State& state) { +static void BitmapReader(benchmark::State& state) { BenchmarkBitmapReader(state, state.range(0)); } -static void BM_NaiveBitmapWriter(benchmark::State& state) { - BenchmarkBitmapWriter(state, state.range(0)); -} - -static void BM_BitmapWriter(benchmark::State& state) { +static void BitmapWriter(benchmark::State& state) { BenchmarkBitmapWriter(state, state.range(0)); } -static void BM_FirstTimeBitmapWriter(benchmark::State& state) { +static void FirstTimeBitmapWriter(benchmark::State& state) { BenchmarkBitmapWriter(state, state.range(0)); } @@ -208,72 +204,71 @@ struct GenerateBitsUnrolledFunctor { } }; -static void BM_GenerateBits(benchmark::State& state) { +static void GenerateBits(benchmark::State& state) { BenchmarkGenerateBits(state, state.range(0)); } -static void BM_GenerateBitsUnrolled(benchmark::State& state) { +static void GenerateBitsUnrolled(benchmark::State& state) { BenchmarkGenerateBits(state, state.range(0)); } -static void BM_CopyBitmap(benchmark::State& state) { // NOLINT non-const reference - const int kBufferSize = static_cast(state.range(0)); - std::shared_ptr buffer = CreateRandomBuffer(kBufferSize); +constexpr int64_t kBufferSize = 1024 * 8; + +template +static void CopyBitmap(benchmark::State& state) { // NOLINT non-const reference + const int64_t buffer_size = state.range(0); + const int64_t bits_size = buffer_size * 8; + std::shared_ptr buffer = CreateRandomBuffer(buffer_size); - const int num_bits = kBufferSize * 8; const uint8_t* src = buffer->data(); + const int64_t offset = Offset; + const int64_t length = bits_size - offset; std::shared_ptr copy; - while (state.KeepRunning()) { - ABORT_NOT_OK(CopyBitmap(default_memory_pool(), src, state.range(1), num_bits, ©)); + auto pool = default_memory_pool(); + ABORT_NOT_OK(AllocateEmptyBitmap(pool, length, ©)); + + for (auto _ : state) { + internal::CopyBitmap(src, offset, length, copy->mutable_data(), 0, false); } - state.SetBytesProcessed(state.iterations() * kBufferSize * sizeof(int8_t)); + + state.SetBytesProcessed(state.iterations() * buffer_size); } -BENCHMARK(BM_CopyBitmap) - 
->Args({100000, 0}) - ->Args({1000000, 0}) - ->Args({100000, 4}) - ->Args({1000000, 4}) - ->MinTime(1.0) - ->Unit(benchmark::kMicrosecond); - -BENCHMARK(BM_NaiveBitmapReader) - ->Args({1000000}) - ->MinTime(5.0) - ->Unit(benchmark::kMicrosecond); - -BENCHMARK(BM_BitmapReader)->Args({1000000})->MinTime(5.0)->Unit(benchmark::kMicrosecond); - -BENCHMARK(BM_NaiveBitmapWriter) - ->Args({100000}) - ->Repetitions(2) - ->MinTime(1.0) - ->Unit(benchmark::kMicrosecond); - -BENCHMARK(BM_BitmapWriter) - ->Args({100000}) - ->Repetitions(2) - ->MinTime(1.0) - ->Unit(benchmark::kMicrosecond); - -BENCHMARK(BM_FirstTimeBitmapWriter) - ->Args({100000}) - ->Repetitions(2) - ->MinTime(1.0) - ->Unit(benchmark::kMicrosecond); - -BENCHMARK(BM_GenerateBits) - ->Args({100000}) - ->Repetitions(2) - ->MinTime(1.0) - ->Unit(benchmark::kMicrosecond); - -BENCHMARK(BM_GenerateBitsUnrolled) - ->Args({100000}) - ->Repetitions(2) - ->MinTime(1.0) - ->Unit(benchmark::kMicrosecond); +static void CopyBitmapWithoutOffset( + benchmark::State& state) { // NOLINT non-const reference + CopyBitmap<0>(state); +} + +// Trigger the slow path where the buffer is not byte aligned. +static void CopyBitmapWithOffset(benchmark::State& state) { // NOLINT non-const reference + CopyBitmap<4>(state); +} + +#ifdef ARROW_WITH_BENCHMARKS_REFERENCE + +static void ReferenceNaiveBitmapReader(benchmark::State& state) { + BenchmarkBitmapReader(state, state.range(0)); +} + +static void ReferenceNaiveBitmapWriter(benchmark::State& state) { + BenchmarkBitmapWriter(state, state.range(0)); +} + +BENCHMARK(ReferenceNaiveBitmapWriter)->Arg(kBufferSize); +BENCHMARK(ReferenceNaiveBitmapReader)->Arg(kBufferSize); + +#endif + +BENCHMARK(CopyBitmapWithoutOffset)->Arg(kBufferSize); +BENCHMARK(CopyBitmapWithOffset)->Arg(kBufferSize); + +BENCHMARK(BitmapReader)->Arg(kBufferSize); +BENCHMARK(BitmapWriter)->Arg(kBufferSize); + +BENCHMARK(FirstTimeBitmapWriter)->Arg(kBufferSize); +BENCHMARK(GenerateBits)->Arg(kBufferSize); +BENCHMARK(GenerateBitsUnrolled)->Arg(kBufferSize); } // namespace BitUtil } // namespace arrow diff --git a/cpp/src/arrow/util/compression-benchmark.cc b/cpp/src/arrow/util/compression-benchmark.cc index e71d80ada49..28bc1255c4b 100644 --- a/cpp/src/arrow/util/compression-benchmark.cc +++ b/cpp/src/arrow/util/compression-benchmark.cc @@ -29,6 +29,8 @@ namespace arrow { namespace util { +#ifdef ARROW_WITH_BENCHMARKS_REFERENCE + std::vector MakeCompressibleData(int data_size) { // XXX This isn't a real-world corpus so doesn't really represent the // comparative qualities of the algorithms @@ -111,9 +113,9 @@ int64_t StreamingCompress(Codec* codec, const std::vector& data, return compressed_size; } -static void BM_StreamingCompression( - Compression::type compression, const std::vector& data, - benchmark::State& state) { // NOLINT non-const reference +static void StreamingCompression(Compression::type compression, + const std::vector& data, + benchmark::State& state) { // NOLINT non-const reference std::unique_ptr codec; ABORT_NOT_OK(Codec::Create(compression, &codec)); @@ -126,14 +128,14 @@ static void BM_StreamingCompression( } template -static void BM_StreamingCompression( +static void ReferenceStreamingCompression( benchmark::State& state) { // NOLINT non-const reference auto data = MakeCompressibleData(8 * 1024 * 1024); // 8 MB - BM_StreamingCompression(COMPRESSION, data, state); + StreamingCompression(COMPRESSION, data, state); } -static void BM_StreamingDecompression( +static void StreamingDecompression( Compression::type compression, const 
diff --git a/cpp/src/arrow/util/compression-benchmark.cc b/cpp/src/arrow/util/compression-benchmark.cc
index e71d80ada49..28bc1255c4b 100644
--- a/cpp/src/arrow/util/compression-benchmark.cc
+++ b/cpp/src/arrow/util/compression-benchmark.cc
@@ -29,6 +29,8 @@ namespace arrow {
 namespace util {

+#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
+
 std::vector<uint8_t> MakeCompressibleData(int data_size) {
   // XXX This isn't a real-world corpus so doesn't really represent the
   // comparative qualities of the algorithms
@@ -111,9 +113,9 @@ int64_t StreamingCompress(Codec* codec, const std::vector<uint8_t>& data,
   return compressed_size;
 }

-static void BM_StreamingCompression(
-    Compression::type compression, const std::vector<uint8_t>& data,
-    benchmark::State& state) {  // NOLINT non-const reference
+static void StreamingCompression(Compression::type compression,
+                                 const std::vector<uint8_t>& data,
+                                 benchmark::State& state) {  // NOLINT non-const reference
   std::unique_ptr<Codec> codec;
   ABORT_NOT_OK(Codec::Create(compression, &codec));

@@ -126,14 +128,14 @@ static void BM_StreamingCompression(
 }

 template <Compression::type COMPRESSION>
-static void BM_StreamingCompression(
+static void ReferenceStreamingCompression(
     benchmark::State& state) {  // NOLINT non-const reference
   auto data = MakeCompressibleData(8 * 1024 * 1024);  // 8 MB
-  BM_StreamingCompression(COMPRESSION, data, state);
+  StreamingCompression(COMPRESSION, data, state);
 }

-static void BM_StreamingDecompression(
+static void StreamingDecompression(
     Compression::type compression, const std::vector<uint8_t>& data,
     benchmark::State& state) {  // NOLINT non-const reference
   std::unique_ptr<Codec> codec;
@@ -173,38 +175,24 @@ static void BM_StreamingDecompression(
 }

 template <Compression::type COMPRESSION>
-static void BM_StreamingDecompression(
+static void ReferenceStreamingDecompression(
     benchmark::State& state) {  // NOLINT non-const reference
   auto data = MakeCompressibleData(8 * 1024 * 1024);  // 8 MB
-  BM_StreamingDecompression(COMPRESSION, data, state);
+  StreamingDecompression(COMPRESSION, data, state);
 }

-BENCHMARK_TEMPLATE(BM_StreamingCompression, Compression::GZIP)
-    ->Unit(benchmark::kMillisecond)
-    ->Repetitions(1);
-BENCHMARK_TEMPLATE(BM_StreamingCompression, Compression::BROTLI)
-    ->Unit(benchmark::kMillisecond)
-    ->Repetitions(1);
-BENCHMARK_TEMPLATE(BM_StreamingCompression, Compression::ZSTD)
-    ->Unit(benchmark::kMillisecond)
-    ->Repetitions(1);
-BENCHMARK_TEMPLATE(BM_StreamingCompression, Compression::LZ4)
-    ->Unit(benchmark::kMillisecond)
-    ->Repetitions(1);
-
-BENCHMARK_TEMPLATE(BM_StreamingDecompression, Compression::GZIP)
-    ->Unit(benchmark::kMillisecond)
-    ->Repetitions(1);
-BENCHMARK_TEMPLATE(BM_StreamingDecompression, Compression::BROTLI)
-    ->Unit(benchmark::kMillisecond)
-    ->Repetitions(1);
-BENCHMARK_TEMPLATE(BM_StreamingDecompression, Compression::ZSTD)
-    ->Unit(benchmark::kMillisecond)
-    ->Repetitions(1);
-BENCHMARK_TEMPLATE(BM_StreamingDecompression, Compression::LZ4)
-    ->Unit(benchmark::kMillisecond)
-    ->Repetitions(1);
+BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::GZIP);
+BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::BROTLI);
+BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::ZSTD);
+BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::LZ4);
+
+BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::GZIP);
+BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::BROTLI);
+BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::ZSTD);
+BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::LZ4);
+
+#endif

 }  // namespace util
 }  // namespace arrow
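
Note that the whole body of compression-benchmark.cc now sits behind `ARROW_WITH_BENCHMARKS_REFERENCE`, so the translation unit is effectively empty unless CMake defines the macro. The guard pattern, reduced to its essentials (the benchmark body is a hypothetical placeholder):

#include <benchmark/benchmark.h>

// Everything, including registration, sits behind the macro that CMake
// defines when ARROW_BUILD_BENCHMARKS_REFERENCE=ON, so ordinary benchmark
// runs never compile, list, or execute the reference variants.
#ifdef ARROW_WITH_BENCHMARKS_REFERENCE

static void ReferenceNaiveSum(benchmark::State& state) {
  int64_t sum = 0;
  for (auto _ : state) {
    for (int64_t i = 0; i < state.range(0); ++i) sum += i;
    benchmark::DoNotOptimize(sum);
  }
}

BENCHMARK(ReferenceNaiveSum)->Arg(1 << 20);

#endif  // ARROW_WITH_BENCHMARKS_REFERENCE

BENCHMARK_MAIN();
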
diff --git a/cpp/src/arrow/util/decimal-benchmark.cc b/cpp/src/arrow/util/decimal-benchmark.cc
index 3129536cf0a..e5b1d2a3c4e 100644
--- a/cpp/src/arrow/util/decimal-benchmark.cc
+++ b/cpp/src/arrow/util/decimal-benchmark.cc
@@ -26,7 +26,7 @@
 namespace arrow {
 namespace Decimal {

-static void BM_FromString(benchmark::State& state) {  // NOLINT non-const reference
+static void FromString(benchmark::State& state) {  // NOLINT non-const reference
   std::vector<std::string> values = {"0", "1.23", "12.345e6", "-12.345e-6"};

   while (state.KeepRunning()) {
@@ -39,7 +39,7 @@ static void BM_FromString(benchmark::State& state) {  // NOLINT non-const reference
   state.SetItemsProcessed(state.iterations() * values.size());
 }

-BENCHMARK(BM_FromString)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+BENCHMARK(FromString);

 }  // namespace Decimal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/hashing-benchmark.cc b/cpp/src/arrow/util/hashing-benchmark.cc
index 2049c4e64e1..c7051d1a351 100644
--- a/cpp/src/arrow/util/hashing-benchmark.cc
+++ b/cpp/src/arrow/util/hashing-benchmark.cc
@@ -62,7 +62,7 @@ static std::vector<std::string> MakeStrings(int32_t n_values, int32_t min_length,
   return values;
 }

-static void BM_HashIntegers(benchmark::State& state) {  // NOLINT non-const reference
+static void HashIntegers(benchmark::State& state) {  // NOLINT non-const reference
   const std::vector<int64_t> values = MakeIntegers(10000);

   while (state.KeepRunning()) {
@@ -96,17 +96,17 @@ static void BenchmarkStringHashing(benchmark::State& state,  // NOLINT non-const
   state.SetItemsProcessed(2 * state.iterations() * values.size());
 }

-static void BM_HashSmallStrings(benchmark::State& state) {  // NOLINT non-const reference
+static void HashSmallStrings(benchmark::State& state) {  // NOLINT non-const reference
   const std::vector<std::string> values = MakeStrings(10000, 2, 20);
   BenchmarkStringHashing(state, values);
 }

-static void BM_HashMediumStrings(benchmark::State& state) {  // NOLINT non-const reference
+static void HashMediumStrings(benchmark::State& state) {  // NOLINT non-const reference
   const std::vector<std::string> values = MakeStrings(10000, 20, 120);
   BenchmarkStringHashing(state, values);
 }

-static void BM_HashLargeStrings(benchmark::State& state) {  // NOLINT non-const reference
+static void HashLargeStrings(benchmark::State& state) {  // NOLINT non-const reference
   const std::vector<std::string> values = MakeStrings(1000, 120, 2000);
   BenchmarkStringHashing(state, values);
 }
@@ -114,15 +114,10 @@ static void BM_HashLargeStrings(benchmark::State& state) {  // NOLINT non-const
 // ----------------------------------------------------------------------
 // Benchmark declarations

-static constexpr int32_t kRepetitions = 1;
-
-BENCHMARK(BM_HashIntegers)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_HashSmallStrings)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_HashMediumStrings)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_HashLargeStrings)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond);
+BENCHMARK(HashIntegers);
+BENCHMARK(HashSmallStrings);
+BENCHMARK(HashMediumStrings);
+BENCHMARK(HashLargeStrings);

 }  // namespace internal
 }  // namespace arrow
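
These hashing benchmarks report throughput through `SetItemsProcessed`, which matters for the archery changes later in this patch: Google Benchmark only emits `bytes_per_second` or `items_per_second` in its JSON when the benchmark sets the corresponding counter, and the updated `GoogleBenchmarkObservation` prefers those fields over raw time. A hypothetical sketch of a benchmark that feeds both counters:

#include <benchmark/benchmark.h>

#include <numeric>
#include <vector>

// Setting the processed counters is what makes Google Benchmark report
// bytes_per_second / items_per_second in its JSON output; the archery
// comparator keys off them to pick the unit and the less-is-better rule.
static void SumInt64(benchmark::State& state) {
  std::vector<int64_t> values(4096, 1);
  for (auto _ : state) {
    int64_t sum = std::accumulate(values.begin(), values.end(), int64_t{0});
    benchmark::DoNotOptimize(sum);
  }
  state.SetItemsProcessed(state.iterations() * values.size());
  state.SetBytesProcessed(state.iterations() * values.size() * sizeof(int64_t));
}

BENCHMARK(SumInt64);
BENCHMARK_MAIN();
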
diff --git a/cpp/src/arrow/util/int-util-benchmark.cc b/cpp/src/arrow/util/int-util-benchmark.cc
index 3feb2eeafeb..1b306ed946b 100644
--- a/cpp/src/arrow/util/int-util-benchmark.cc
+++ b/cpp/src/arrow/util/int-util-benchmark.cc
@@ -49,7 +49,7 @@ std::vector<uint8_t> GetValidBytes(int n_values) {
   return valid_bytes;
 }

-static void BM_DetectUIntWidthNoNulls(
+static void DetectUIntWidthNoNulls(
     benchmark::State& state) {  // NOLINT non-const reference
   const auto values = GetUIntSequence(0x12345);

@@ -60,8 +60,7 @@ static void BM_DetectUIntWidthNoNulls(
   state.SetBytesProcessed(state.iterations() * values.size() * sizeof(uint64_t));
 }

-static void BM_DetectUIntWidthNulls(
-    benchmark::State& state) {  // NOLINT non-const reference
+static void DetectUIntWidthNulls(benchmark::State& state) {  // NOLINT non-const reference
   const auto values = GetUIntSequence(0x12345);
   const auto valid_bytes = GetValidBytes(0x12345);

@@ -73,7 +72,7 @@ static void BM_DetectUIntWidthNulls(
   state.SetBytesProcessed(state.iterations() * values.size() * sizeof(uint64_t));
 }

-static void BM_DetectIntWidthNoNulls(
+static void DetectIntWidthNoNulls(
     benchmark::State& state) {  // NOLINT non-const reference
   const auto values = GetIntSequence(0x12345, -0x1234);

@@ -84,8 +83,7 @@ static void BM_DetectIntWidthNoNulls(
   state.SetBytesProcessed(state.iterations() * values.size() * sizeof(uint64_t));
 }

-static void BM_DetectIntWidthNulls(
-    benchmark::State& state) {  // NOLINT non-const reference
+static void DetectIntWidthNulls(benchmark::State& state) {  // NOLINT non-const reference
   const auto values = GetIntSequence(0x12345, -0x1234);
   const auto valid_bytes = GetValidBytes(0x12345);

@@ -97,13 +95,10 @@ static void BM_DetectIntWidthNulls(
   state.SetBytesProcessed(state.iterations() * values.size() * sizeof(uint64_t));
 }

-BENCHMARK(BM_DetectUIntWidthNoNulls)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_DetectUIntWidthNulls)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_DetectIntWidthNoNulls)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
-
-BENCHMARK(BM_DetectIntWidthNulls)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
+BENCHMARK(DetectUIntWidthNoNulls);
+BENCHMARK(DetectUIntWidthNulls);
+BENCHMARK(DetectIntWidthNoNulls);
+BENCHMARK(DetectIntWidthNulls);

 }  // namespace internal
 }  // namespace arrow
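
A pattern worth noting across all these files: the per-benchmark `Repetitions()`, `MinTime()`, and `Unit()` calls are consistently dropped, so statistics policy is no longer baked into each binary. Instead the harness injects it uniformly at run time; the google.py change later in this patch passes `--benchmark_repetitions=<n>` (default 10). A hypothetical before/after sketch:

#include <benchmark/benchmark.h>

static void Example(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(state.iterations());
  }
}

// Before: policy baked into the binary, per benchmark:
//   BENCHMARK(Example)->MinTime(1.0)->Repetitions(3)->Unit(benchmark::kMicrosecond);
//
// After: a bare registration; the runner injects the policy uniformly,
// e.g. via --benchmark_repetitions=10 as archery's GoogleBenchmarkCommand does.
BENCHMARK(Example);

BENCHMARK_MAIN();
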
diff --git a/cpp/src/arrow/util/lazy-benchmark.cc b/cpp/src/arrow/util/lazy-benchmark.cc
index 02c7de5c22d..ec39f1f873c 100644
--- a/cpp/src/arrow/util/lazy-benchmark.cc
+++ b/cpp/src/arrow/util/lazy-benchmark.cc
@@ -27,6 +27,8 @@

 namespace arrow {

+#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
+
 static constexpr int64_t kSize = 100000000;

 template <typename T>
@@ -37,7 +39,7 @@ std::vector<T> generate_junk(int64_t size) {
 }

 // Baseline
-void BM_for_loop(benchmark::State& state) {
+void for_loop(benchmark::State& state) {
   auto source = generate_junk<int32_t>(kSize);
   std::vector<int32_t> target(kSize);

@@ -46,10 +48,10 @@
   }
 }

-BENCHMARK(BM_for_loop)->Repetitions(3)->Unit(benchmark::kMillisecond);
+BENCHMARK(for_loop);

 // For comparison: pure copy without any changes
-void BM_std_copy(benchmark::State& state) {
+void std_copy(benchmark::State& state) {
   auto source = generate_junk<int32_t>(kSize);
   std::vector<int32_t> target(kSize);

@@ -58,10 +60,10 @@
   }
 }

-BENCHMARK(BM_std_copy)->Repetitions(3)->Unit(benchmark::kMillisecond);
+BENCHMARK(std_copy);

 // For comparison: pure copy with type conversion.
-void BM_std_copy_converting(benchmark::State& state) {
+void std_copy_converting(benchmark::State& state) {
   auto source = generate_junk<int32_t>(kSize);
   // bigger type to avoid warnings
   std::vector<int64_t> target(kSize);

@@ -71,10 +73,10 @@
   }
 }

-BENCHMARK(BM_std_copy_converting)->Repetitions(3)->Unit(benchmark::kMillisecond);
+BENCHMARK(std_copy_converting);

 // std::copy with a lazy range as a source
-void BM_lazy_copy(benchmark::State& state) {
+void lazy_copy(benchmark::State& state) {
   auto source = generate_junk<int32_t>(kSize);
   std::vector<int32_t> target(kSize);
   auto lazy_range = internal::MakeLazyRange(
@@ -85,11 +87,11 @@
   }
 }

-BENCHMARK(BM_lazy_copy)->Repetitions(3)->Unit(benchmark::kMillisecond);
+BENCHMARK(lazy_copy);

 // std::copy with a lazy range which does static cast.
 // Should be the same performance as std::copy with differently typed iterators
-void BM_lazy_copy_converting(benchmark::State& state) {
+void lazy_copy_converting(benchmark::State& state) {
   auto source = generate_junk<int32_t>(kSize);
   std::vector<int64_t> target(kSize);
   auto lazy_range = internal::MakeLazyRange(
@@ -101,10 +103,10 @@
   }
 }

-BENCHMARK(BM_lazy_copy_converting)->Repetitions(3)->Unit(benchmark::kMillisecond);
+BENCHMARK(lazy_copy_converting);

 // For loop with a post-increment of a lazy operator
-void BM_lazy_postinc(benchmark::State& state) {
+void lazy_postinc(benchmark::State& state) {
   auto source = generate_junk<int32_t>(kSize);
   std::vector<int32_t> target(kSize);
   auto lazy_range = internal::MakeLazyRange(
@@ -119,6 +121,8 @@
   }
 }

-BENCHMARK(BM_lazy_postinc)->Repetitions(3)->Unit(benchmark::kMillisecond);
+BENCHMARK(lazy_postinc);
+
+#endif  // ARROW_WITH_BENCHMARKS_REFERENCE

 }  // namespace arrow
diff --git a/cpp/src/arrow/util/machine-benchmark.cc b/cpp/src/arrow/util/machine-benchmark.cc
index ad3f413e7f0..67397444bd9 100644
--- a/cpp/src/arrow/util/machine-benchmark.cc
+++ b/cpp/src/arrow/util/machine-benchmark.cc
@@ -28,6 +28,8 @@

 namespace arrow {

+#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
+
 // Generate a vector of indices such that following the indices describes
 // a path over the whole vector. The path is randomized to avoid triggering
 // automatic prefetching in the CPU.
@@ -51,7 +53,7 @@ std::vector<int32_t> RandomPath(int32_t size) {
 }

 // Cache / main memory latency, depending on the working set size
-static void BM_memory_latency(benchmark::State& state) {
+static void memory_latency(benchmark::State& state) {
   const auto niters = static_cast<int32_t>(state.range(0));
   const std::vector<int32_t> path = RandomPath(niters / 4);

@@ -65,6 +67,8 @@ static void BM_memory_latency(benchmark::State& state) {
   state.SetItemsProcessed(state.iterations());
 }

-BENCHMARK(BM_memory_latency)->RangeMultiplier(2)->Range(2 << 10, 2 << 24);
+BENCHMARK(memory_latency)->Repetitions(1)->RangeMultiplier(2)->Range(2 << 10, 2 << 24);
+
+#endif  // ARROW_WITH_BENCHMARKS_REFERENCE

 }  // namespace arrow
diff --git a/cpp/src/arrow/util/number-parsing-benchmark.cc b/cpp/src/arrow/util/number-parsing-benchmark.cc
index d94011e35f0..e13ece90adc 100644
--- a/cpp/src/arrow/util/number-parsing-benchmark.cc
+++ b/cpp/src/arrow/util/number-parsing-benchmark.cc
@@ -69,7 +69,7 @@ static std::vector<std::string> MakeTimestampStrings(int32_t num_items) {
 }

 template <typename ARROW_TYPE>
-static void BM_IntegerParsing(benchmark::State& state) {  // NOLINT non-const reference
+static void IntegerParsing(benchmark::State& state) {  // NOLINT non-const reference
   auto strings = MakeIntStrings(1000);
   StringConverter<ARROW_TYPE> converter;

@@ -89,7 +89,7 @@ static void BM_IntegerParsing(benchmark::State& state) {  // NOLINT non-const reference
 }

 template <typename ARROW_TYPE>
-static void BM_FloatParsing(benchmark::State& state) {  // NOLINT non-const reference
+static void FloatParsing(benchmark::State& state) {  // NOLINT non-const reference
   auto strings = MakeFloatStrings(1000);
   StringConverter<ARROW_TYPE> converter;

@@ -109,7 +109,7 @@ static void BM_FloatParsing(benchmark::State& state) {  // NOLINT non-const reference
 }

 template <TimeUnit::type UNIT>
-static void BM_TimestampParsing(benchmark::State& state) {  // NOLINT non-const reference
+static void TimestampParsing(benchmark::State& state) {  // NOLINT non-const reference
   using c_type = TimestampType::c_type;

   auto strings = MakeTimestampStrings(1000);
@@ -131,22 +131,22 @@ static void BM_TimestampParsing(benchmark::State& state) {  // NOLINT
non-const state.SetItemsProcessed(state.iterations() * strings.size()); } -BENCHMARK_TEMPLATE(BM_IntegerParsing, Int8Type); -BENCHMARK_TEMPLATE(BM_IntegerParsing, Int16Type); -BENCHMARK_TEMPLATE(BM_IntegerParsing, Int32Type); -BENCHMARK_TEMPLATE(BM_IntegerParsing, Int64Type); -BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt8Type); -BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt16Type); -BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt32Type); -BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt64Type); - -BENCHMARK_TEMPLATE(BM_FloatParsing, FloatType); -BENCHMARK_TEMPLATE(BM_FloatParsing, DoubleType); - -BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::SECOND); -BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::MILLI); -BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::MICRO); -BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::NANO); +BENCHMARK_TEMPLATE(IntegerParsing, Int8Type); +BENCHMARK_TEMPLATE(IntegerParsing, Int16Type); +BENCHMARK_TEMPLATE(IntegerParsing, Int32Type); +BENCHMARK_TEMPLATE(IntegerParsing, Int64Type); +BENCHMARK_TEMPLATE(IntegerParsing, UInt8Type); +BENCHMARK_TEMPLATE(IntegerParsing, UInt16Type); +BENCHMARK_TEMPLATE(IntegerParsing, UInt32Type); +BENCHMARK_TEMPLATE(IntegerParsing, UInt64Type); + +BENCHMARK_TEMPLATE(FloatParsing, FloatType); +BENCHMARK_TEMPLATE(FloatParsing, DoubleType); + +BENCHMARK_TEMPLATE(TimestampParsing, TimeUnit::SECOND); +BENCHMARK_TEMPLATE(TimestampParsing, TimeUnit::MILLI); +BENCHMARK_TEMPLATE(TimestampParsing, TimeUnit::MICRO); +BENCHMARK_TEMPLATE(TimestampParsing, TimeUnit::NANO); } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/thread-pool-benchmark.cc b/cpp/src/arrow/util/thread-pool-benchmark.cc index f0f23622deb..b10a5d194dc 100644 --- a/cpp/src/arrow/util/thread-pool-benchmark.cc +++ b/cpp/src/arrow/util/thread-pool-benchmark.cc @@ -73,22 +73,8 @@ struct Task { Workload workload_; }; -// This benchmark simply provides a baseline indicating the raw cost of our workload -// depending on the workload size. Number of items / second in this (serial) -// benchmark can be compared to the numbers obtained in BM_ThreadPoolSpawn. 
-static void BM_WorkloadCost(benchmark::State& state) {
-  const auto workload_size = static_cast<int32_t>(state.range(0));
-
-  Workload workload(workload_size);
-  for (auto _ : state) {
-    workload();
-  }
-
-  state.SetItemsProcessed(state.iterations());
-}
-
 // Benchmark ThreadPool::Spawn
-static void BM_ThreadPoolSpawn(benchmark::State& state) {
+static void ThreadPoolSpawn(benchmark::State& state) {
   const auto nthreads = static_cast<int>(state.range(0));
   const auto workload_size = static_cast<int32_t>(state.range(1));

@@ -118,7 +104,7 @@ static void BM_ThreadPoolSpawn(benchmark::State& state) {
 }

 // Benchmark serial TaskGroup
-static void BM_SerialTaskGroup(benchmark::State& state) {
+static void SerialTaskGroup(benchmark::State& state) {
   const auto workload_size = static_cast<int32_t>(state.range(0));

   Task task(workload_size);
@@ -137,7 +123,7 @@ static void BM_SerialTaskGroup(benchmark::State& state) {
 }

 // Benchmark threaded TaskGroup
-static void BM_ThreadedTaskGroup(benchmark::State& state) {
+static void ThreadedTaskGroup(benchmark::State& state) {
   const auto nthreads = static_cast<int>(state.range(0));
   const auto workload_size = static_cast<int32_t>(state.range(1));

@@ -168,6 +154,7 @@ static void WorkloadCost_Customize(benchmark::internal::Benchmark* b) {
     b->Args({w});
   }
   b->ArgNames({"task_cost"});
+  b->UseRealTime();
 }

 static void ThreadPoolSpawn_Customize(benchmark::internal::Benchmark* b) {
@@ -177,26 +164,32 @@ static void ThreadPoolSpawn_Customize(benchmark::internal::Benchmark* b) {
     }
   }
   b->ArgNames({"threads", "task_cost"});
+  b->UseRealTime();
 }
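
Moving `UseRealTime()` into the `*_Customize` helpers is the interesting change here: these benchmarks do their work on pool threads, so the benchmark thread's CPU time is near zero and only wall-clock time is meaningful. A hypothetical sketch of the same idea using plain `std::thread` rather than Arrow's ThreadPool:

#include <benchmark/benchmark.h>

#include <thread>
#include <vector>

// Work happens on worker threads, so the driving thread's CPU time is
// meaningless; UseRealTime() makes Google Benchmark report wall-clock time.
static void SpawnThreads(benchmark::State& state) {
  const int nthreads = static_cast<int>(state.range(0));
  for (auto _ : state) {
    std::vector<std::thread> workers;
    for (int i = 0; i < nthreads; ++i) {
      workers.emplace_back([] { benchmark::ClobberMemory(); });
    }
    for (auto& w : workers) w.join();
  }
  state.SetItemsProcessed(state.iterations() * nthreads);
}

static void SpawnThreads_Customize(benchmark::internal::Benchmark* b) {
  for (const int t : {1, 2, 4}) {
    b->Arg(t);
  }
  b->ArgNames({"threads"});
  b->UseRealTime();  // the same choice the diff centralizes in *_Customize
}

BENCHMARK(SpawnThreads)->Apply(SpawnThreads_Customize);
BENCHMARK_MAIN();
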

-static const int kRepetitions = 1;
+#ifdef ARROW_WITH_BENCHMARKS_REFERENCE

-BENCHMARK(BM_WorkloadCost)->Repetitions(kRepetitions)->Apply(WorkloadCost_Customize);
+// This benchmark simply provides a baseline indicating the raw cost of our workload
+// depending on the workload size. Number of items / second in this (serial)
+// benchmark can be compared to the numbers obtained in ThreadPoolSpawn.
+static void ReferenceWorkloadCost(benchmark::State& state) {
+  const auto workload_size = static_cast<int32_t>(state.range(0));
+
+  Workload workload(workload_size);
+  for (auto _ : state) {
+    workload();
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}

-BENCHMARK(BM_ThreadPoolSpawn)
-    ->UseRealTime()
-    ->Repetitions(kRepetitions)
-    ->Apply(ThreadPoolSpawn_Customize);
+BENCHMARK(ReferenceWorkloadCost)->Apply(WorkloadCost_Customize);

-BENCHMARK(BM_SerialTaskGroup)
-    ->UseRealTime()
-    ->Repetitions(kRepetitions)
-    ->Apply(WorkloadCost_Customize);
+#endif

-BENCHMARK(BM_ThreadedTaskGroup)
-    ->UseRealTime()
-    ->Repetitions(kRepetitions)
-    ->Apply(ThreadPoolSpawn_Customize);
+BENCHMARK(SerialTaskGroup)->Apply(WorkloadCost_Customize);
+BENCHMARK(ThreadPoolSpawn)->Apply(ThreadPoolSpawn_Customize);
+BENCHMARK(ThreadedTaskGroup)->Apply(ThreadPoolSpawn_Customize);

 }  // namespace internal
 }  // namespace arrow
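
(The added function is named `ReferenceWorkloadCost` above so the definition matches the `BENCHMARK(ReferenceWorkloadCost)` registration; the original hunk defined `WorkloadCost` but registered `ReferenceWorkloadCost`, which would not compile.)

The trie-benchmark.cc change below promotes `TrieLookupFound`/`TrieLookupNotFound` to always-on benchmarks and demotes the hand-inlined lookup to a guarded reference. For readers unfamiliar with the API, here is a sketch of the build/lookup cycle as it can be inferred from the calls in this diff; it is not an authoritative description of `arrow/util/trie.h`, and the header path and `ABORT_NOT_OK` usage follow this patch's own includes:

#include "arrow/testing/gtest_util.h"  // ABORT_NOT_OK, as included by these benchmarks
#include "arrow/util/trie.h"           // assumed header for Trie / TrieBuilder

// Inferred usage: Append() returns a Status, Finish() yields the immutable
// Trie, and Find() returns the index of the match (negative when absent).
static arrow::internal::Trie MakeExampleTrie() {
  arrow::internal::TrieBuilder builder;
  for (const char* s : {"NA", "NULL", "NaN"}) {
    ABORT_NOT_OK(builder.Append(s));
  }
  return builder.Finish();
}
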
diff --git a/cpp/src/arrow/util/trie-benchmark.cc b/cpp/src/arrow/util/trie-benchmark.cc
index 8aab8b8c62c..868accc3744 100644
--- a/cpp/src/arrow/util/trie-benchmark.cc
+++ b/cpp/src/arrow/util/trie-benchmark.cc
@@ -28,6 +28,64 @@
 namespace arrow {
 namespace internal {

+std::vector<std::string> AllNulls() {
+  return {"#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND",
+          "1.#QNAN", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"};
+}
+
+Trie MakeNullsTrie() {
+  auto nulls = AllNulls();
+
+  TrieBuilder builder;
+  for (const auto& str : AllNulls()) {
+    ABORT_NOT_OK(builder.Append(str));
+  }
+  return builder.Finish();
+}
+
+std::vector<std::string> Expand(const std::vector<std::string>& base, size_t n) {
+  std::vector<std::string> result;
+  result.reserve(n);
+
+  while (true) {
+    for (const auto& v : base) {
+      result.push_back(v);
+      if (result.size() == n) {
+        return result;
+      }
+    }
+  }
+}
+
+static void BenchmarkTrieLookups(benchmark::State& state,  // NOLINT non-const reference
+                                 const std::vector<std::string>& strings) {
+  Trie trie = MakeNullsTrie();
+  int32_t total = 0;
+
+  auto lookups = Expand(strings, 100);
+
+  for (auto _ : state) {
+    for (const auto& s : lookups) {
+      total += trie.Find(s);
+    }
+  }
+  benchmark::DoNotOptimize(total);
+  state.SetItemsProcessed(state.iterations() * lookups.size());
+}
+
+static void TrieLookupFound(benchmark::State& state) {  // NOLINT non-const reference
+  BenchmarkTrieLookups(state, {"N/A", "null", "-1.#IND", "N/A"});
+}
+
+static void TrieLookupNotFound(benchmark::State& state) {  // NOLINT non-const reference
+  BenchmarkTrieLookups(state, {"None", "1.0", "", "abc"});
+}
+
+BENCHMARK(TrieLookupFound);
+BENCHMARK(TrieLookupNotFound);
+
+#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
+
 static inline bool InlinedNullLookup(util::string_view s) {
   // An inlined version of trie lookup for a specific set of strings
   // (see AllNulls())
@@ -130,51 +188,6 @@ static inline bool InlinedNullLookup(util::string_view s) {
   }
 }

-std::vector<std::string> AllNulls() {
-  return {"#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND",
-          "1.#QNAN", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"};
-}
-
-Trie MakeNullsTrie() {
-  auto nulls = AllNulls();
-
-  TrieBuilder builder;
-  for (const auto& str : AllNulls()) {
-    ABORT_NOT_OK(builder.Append(str));
-  }
-  return builder.Finish();
-}
-
-std::vector<std::string> Expand(const std::vector<std::string>& base, size_t n) {
-  std::vector<std::string> result;
-  result.reserve(n);
-
-  while (true) {
-    for (const auto& v : base) {
-      result.push_back(v);
-      if (result.size() == n) {
-        return result;
-      }
-    }
-  }
-}
-
-static void BenchmarkTrieLookups(benchmark::State& state,  // NOLINT non-const reference
-                                 const std::vector<std::string>& strings) {
-  Trie trie = MakeNullsTrie();
-  int32_t total = 0;
-
-  auto lookups = Expand(strings, 100);
-
-  for (auto _ : state) {
-    for (const auto& s : lookups) {
-      total += trie.Find(s);
-    }
-  }
-  benchmark::DoNotOptimize(total);
-  state.SetItemsProcessed(state.iterations() * lookups.size());
-}
-
 static void BenchmarkInlinedTrieLookups(
     benchmark::State& state,  // NOLINT non-const reference
     const std::vector<std::string>& strings) {
@@ -190,32 +203,20 @@ static void BenchmarkInlinedTrieLookups(
   benchmark::DoNotOptimize(total);
   state.SetItemsProcessed(state.iterations() * lookups.size());
 }

-static void BM_TrieLookupFound(benchmark::State& state) {  // NOLINT non-const reference
-  BenchmarkTrieLookups(state, {"N/A", "null", "-1.#IND", "N/A"});
-}
-
-static void BM_TrieLookupNotFound(
-    benchmark::State& state) {  // NOLINT non-const reference
-  BenchmarkTrieLookups(state, {"None", "1.0", "", "abc"});
-}
-
-static void BM_InlinedTrieLookupFound(
+static void InlinedTrieLookupFound(
     benchmark::State& state) {  // NOLINT non-const reference
   BenchmarkInlinedTrieLookups(state, {"N/A", "null", "-1.#IND", "N/A"});
 }

-static void BM_InlinedTrieLookupNotFound(
+static void InlinedTrieLookupNotFound(
     benchmark::State& state) {  // NOLINT non-const reference
   BenchmarkInlinedTrieLookups(state, {"None", "1.0", "", "abc"});
 }

-static const int kRepetitions = 2;
+BENCHMARK(InlinedTrieLookupFound);
+BENCHMARK(InlinedTrieLookupNotFound);

-BENCHMARK(BM_TrieLookupFound)->Repetitions(kRepetitions);
-BENCHMARK(BM_TrieLookupNotFound)->Repetitions(kRepetitions);
-BENCHMARK(BM_InlinedTrieLookupFound)->Repetitions(kRepetitions);
-BENCHMARK(BM_InlinedTrieLookupNotFound)->Repetitions(kRepetitions);
+#endif

 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/utf8-util-benchmark.cc b/cpp/src/arrow/util/utf8-util-benchmark.cc
index 7e03a4e5324..51f7b2ae050 100644
--- a/cpp/src/arrow/util/utf8-util-benchmark.cc
+++ b/cpp/src/arrow/util/utf8-util-benchmark.cc
@@ -70,58 +70,53 @@ static void BenchmarkUTF8Validation(
   state.SetBytesProcessed(state.iterations() * s.size());
 }

-static void BM_ValidateTinyAscii(benchmark::State& state) {  // NOLINT non-const reference
+static void ValidateTinyAscii(benchmark::State& state) {  // NOLINT non-const reference
   BenchmarkUTF8Validation(state, tiny_valid_ascii, true);
 }

-static void BM_ValidateTinyNonAscii(
-    benchmark::State& state) {  // NOLINT non-const reference
+static void ValidateTinyNonAscii(benchmark::State& state) {  // NOLINT non-const reference
   BenchmarkUTF8Validation(state, tiny_valid_non_ascii, true);
 }

-static void BM_ValidateSmallAscii(
-    benchmark::State& state) {  // NOLINT non-const reference
+static void ValidateSmallAscii(benchmark::State& state) {  // NOLINT non-const reference
   BenchmarkUTF8Validation(state, valid_ascii, true);
 }

-static void BM_ValidateSmallAlmostAscii(
+static void ValidateSmallAlmostAscii(
     benchmark::State& state) {  // NOLINT non-const reference
   BenchmarkUTF8Validation(state, valid_almost_ascii, true);
 }

-static void BM_ValidateSmallNonAscii(
+static void ValidateSmallNonAscii(
     benchmark::State& state) {  // NOLINT non-const reference
   BenchmarkUTF8Validation(state, valid_non_ascii, true);
 }

-static void BM_ValidateLargeAscii(
-    benchmark::State& state) {  // NOLINT non-const reference
+static void ValidateLargeAscii(benchmark::State& state) {  // NOLINT non-const reference
   auto s = MakeLargeString(valid_ascii, 100000);
   BenchmarkUTF8Validation(state, s, true);
 }

-static void BM_ValidateLargeAlmostAscii(
+static void
ValidateLargeAlmostAscii( benchmark::State& state) { // NOLINT non-const reference auto s = MakeLargeString(valid_almost_ascii, 100000); BenchmarkUTF8Validation(state, s, true); } -static void BM_ValidateLargeNonAscii( +static void ValidateLargeNonAscii( benchmark::State& state) { // NOLINT non-const reference auto s = MakeLargeString(valid_non_ascii, 100000); BenchmarkUTF8Validation(state, s, true); } -static const int kRepetitions = 1; - -BENCHMARK(BM_ValidateTinyAscii)->Repetitions(kRepetitions); -BENCHMARK(BM_ValidateTinyNonAscii)->Repetitions(kRepetitions); -BENCHMARK(BM_ValidateSmallAscii)->Repetitions(kRepetitions); -BENCHMARK(BM_ValidateSmallAlmostAscii)->Repetitions(kRepetitions); -BENCHMARK(BM_ValidateSmallNonAscii)->Repetitions(kRepetitions); -BENCHMARK(BM_ValidateLargeAscii)->Repetitions(kRepetitions); -BENCHMARK(BM_ValidateLargeAlmostAscii)->Repetitions(kRepetitions); -BENCHMARK(BM_ValidateLargeNonAscii)->Repetitions(kRepetitions); +BENCHMARK(ValidateTinyAscii); +BENCHMARK(ValidateTinyNonAscii); +BENCHMARK(ValidateSmallAscii); +BENCHMARK(ValidateSmallAlmostAscii); +BENCHMARK(ValidateSmallNonAscii); +BENCHMARK(ValidateLargeAscii); +BENCHMARK(ValidateLargeAlmostAscii); +BENCHMARK(ValidateLargeNonAscii); } // namespace util } // namespace arrow diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py index bd2793eb4e8..49e6ad1b05d 100644 --- a/dev/archery/archery/benchmark/google.py +++ b/dev/archery/archery/benchmark/google.py @@ -30,6 +30,9 @@ def partition(pred, iterable): return list(filter(pred, t1)), list(filterfalse(pred, t2)) +DEFAULT_REPETITIONS = 10 + + class GoogleBenchmarkCommand(Command): """ Run a google benchmark binary. @@ -49,9 +52,9 @@ def list_benchmarks(self): stderr=subprocess.PIPE) return str.splitlines(result.stdout.decode("utf-8")) - def results(self): + def results(self, repetitions=DEFAULT_REPETITIONS): with NamedTemporaryFile() as out: - argv = ["--benchmark_repetitions=20", + argv = [f"--benchmark_repetitions={repetitions}", f"--benchmark_out={out.name}", "--benchmark_out_format=json"] @@ -87,13 +90,14 @@ class GoogleBenchmarkObservation: """ def __init__(self, name, real_time, cpu_time, time_unit, size=None, - bytes_per_second=None, **kwargs): + bytes_per_second=None, items_per_second=None, **kwargs): self._name = name self.real_time = real_time self.cpu_time = cpu_time self.time_unit = time_unit self.size = size self.bytes_per_second = bytes_per_second + self.items_per_second = items_per_second @property def is_agg(self): @@ -118,11 +122,16 @@ def time(self): @property def value(self): """ Return the benchmark value.""" - return self.bytes_per_second if self.size else self.time + return self.bytes_per_second or self.items_per_second or self.time @property def unit(self): - return "bytes_per_second" if self.size else self.time_unit + if self.bytes_per_second: + return "bytes_per_second" + elif self.items_per_second: + return "items_per_second" + else: + return self.time_unit def __repr__(self): return f"{self.value}" @@ -147,9 +156,7 @@ def __init__(self, name, runs): _, runs = partition(lambda b: b.is_agg, runs) self.runs = sorted(runs, key=lambda b: b.value) unit = self.runs[0].unit - # If `size` is found in the json dict, then the benchmark is reported - # in bytes per second - less_is_better = self.runs[0].size is None + less_is_better = not unit.endswith("per_second") values = [b.value for b in self.runs] super().__init__(name, unit, less_is_better, values) diff --git a/dev/archery/archery/benchmark/runner.py 
b/dev/archery/archery/benchmark/runner.py
index dbbb3f5713c..5cee7782c8b 100644
--- a/dev/archery/archery/benchmark/runner.py
+++ b/dev/archery/archery/benchmark/runner.py
@@ -85,6 +85,12 @@ def __init__(self, suites, **kwargs):
         self._suites = suites
         super().__init__(**kwargs)

+    @property
+    def list_benchmarks(self):
+        for suite in self._suites:
+            for benchmark in suite.benchmarks:
+                yield f"{suite.name}.{benchmark.name}"
+
     @property
     def suites(self):
         suite_fn = regex_filter(self.suite_filter)
@@ -146,6 +152,13 @@ def suite(self, name, suite_bin):
         benchmarks = GoogleBenchmark.from_json(results.get("benchmarks"))
         return BenchmarkSuite(name, benchmarks)

+    @property
+    def list_benchmarks(self):
+        for suite_name, suite_bin in self.suites_binaries.items():
+            suite_cmd = GoogleBenchmarkCommand(suite_bin)
+            for benchmark_name in suite_cmd.list_benchmarks():
+                yield f"{suite_name}.{benchmark_name}"
+
     @property
     def suites(self):
         """ Returns all suites for a runner. """
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 0178d58a03f..e2dd9ea1c9b 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -37,10 +37,12 @@
 @click.group()
 @click.option("--debug", type=bool, is_flag=True, default=False,
               help="Increase logging with debugging output.")
+@click.option("--pdb", type=bool, is_flag=True, default=False,
+              help="Invoke pdb on uncaught exception.")
 @click.option("-q", "--quiet", type=bool, is_flag=True, default=False,
               help="Silence executed commands.")
 @click.pass_context
-def archery(ctx, debug, quiet):
+def archery(ctx, debug, pdb, quiet):
     """ Apache Arrow developer utilities.

     See sub-commands help with `archery <cmd> --help`.
@@ -53,6 +55,10 @@ def archery(ctx, debug, quiet):
     if debug:
         logger.setLevel(logging.DEBUG)

+    if pdb:
+        import pdb
+        sys.excepthook = lambda t, v, e: pdb.pm()
+

 def validate_arrow_sources(ctx, param, src):
     """ Ensure a directory contains Arrow cpp sources. """
@@ -167,6 +173,39 @@ def benchmark(ctx):
     pass


+@benchmark.command(name="list", short_help="List benchmark suite")
+@click.option("--src", metavar="<arrow_src>", show_default=True,
+              default=ArrowSources.find(),
+              callback=validate_arrow_sources,
+              help="Specify Arrow source directory")
+@click.option("--preserve", type=bool, default=False, show_default=True,
+              is_flag=True, help="Preserve workspace for investigation.")
+@click.option("--output", metavar="<output>",
+              type=click.File("w", encoding="utf8"), default="-",
+              help="Capture output result into file.")
+@click.option("--cmake-extras", type=str, multiple=True,
+              help="Extra flags/options to pass to cmake invocation. "
+              "Can be stacked")
+@click.argument("rev_or_path", metavar="[<rev_or_path>]", default="WORKSPACE",
+                required=False)
+@click.pass_context
+def benchmark_list(ctx, src, preserve, output, cmake_extras, rev_or_path):
+    """ List benchmark suite.
+ """ + with tmpdir(preserve) as root: + logger.debug(f"Running benchmark {rev_or_path}") + + conf = CppConfiguration( + build_type="release", with_tests=True, with_benchmarks=True, + with_python=False, cmake_extras=cmake_extras) + + runner_base = BenchmarkRunner.from_rev_or_path( + src, root, rev_or_path, conf) + + for b in runner_base.list_benchmarks: + click.echo(b, file=output) + + @benchmark.command(name="run", short_help="Run benchmark suite") @click.option("--src", metavar="", show_default=True, default=ArrowSources.find(), @@ -175,7 +214,7 @@ def benchmark(ctx): @click.option("--suite-filter", metavar="", show_default=True, type=str, default=None, help="Regex filtering benchmark suites.") @click.option("--benchmark-filter", metavar="", show_default=True, - type=str, default=DEFAULT_BENCHMARK_FILTER, + type=str, default=None, help="Regex filtering benchmark suites.") @click.option("--preserve", type=bool, default=False, show_default=True, is_flag=True, help="Preserve workspace for investigation.") @@ -185,11 +224,11 @@ def benchmark(ctx): @click.option("--cmake-extras", type=str, multiple=True, help="Extra flags/options to pass to cmake invocation. " "Can be stacked") -@click.argument("baseline", metavar="[]]", default="master", +@click.argument("rev_or_path", metavar="[]", default="WORKSPACE", required=False) @click.pass_context def benchmark_run(ctx, src, preserve, suite_filter, benchmark_filter, - output, cmake_extras, baseline): + output, cmake_extras, rev_or_path): """ Run benchmark suite. This command will run the benchmark suite for a single build. This is @@ -198,7 +237,6 @@ def benchmark_run(ctx, src, preserve, suite_filter, benchmark_filter, The caller can optionally specify a target which is either a git revision (commit, tag, special values like HEAD) or a cmake build directory. - When a commit is referenced, a local clone of the arrow sources (specified via --src) is performed and the proper branch is created. This is done in a temporary directory which can be left intact with the `---preserve` flag. @@ -224,14 +262,14 @@ def benchmark_run(ctx, src, preserve, suite_filter, benchmark_filter, archery benchmark run --output=run.json """ with tmpdir(preserve) as root: - logger.debug(f"Running benchmark {baseline}") + logger.debug(f"Running benchmark {rev_or_path}") conf = CppConfiguration( build_type="release", with_tests=True, with_benchmarks=True, with_python=False, cmake_extras=cmake_extras) runner_base = BenchmarkRunner.from_rev_or_path( - src, root, baseline, conf, + src, root, rev_or_path, conf, suite_filter=suite_filter, benchmark_filter=benchmark_filter) json.dump(runner_base, output, cls=JsonEncoder) @@ -357,7 +395,7 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, for comparator in runner_comp.comparisons: regressions += comparator.regression json.dump(comparator, output, cls=JsonEncoder) - output.write('\n') + output.write("\n") sys.exit(regressions)