Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 104 additions & 23 deletions cpp/src/parquet/arrow/reader_writer_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "benchmark/benchmark.h"

#include <iostream>
#include <random>

#include "parquet/arrow/reader.h"
#include "parquet/arrow/writer.h"
Expand All @@ -28,6 +29,7 @@
#include "parquet/platform.h"

#include "arrow/api.h"
#include "arrow/util/logging.h"

using arrow::BooleanBuilder;
using arrow::NumericBuilder;
Expand Down Expand Up @@ -95,15 +97,37 @@ void SetBytesProcessed(::benchmark::State& state) {
state.SetBytesProcessed(bytes_processed);
}

// Sentinel passed in place of a percentage. RandomVector treats it as a request for a
// strictly alternating 0, 1, 0, 1, ... pattern instead of a random draw, and the generic
// TableFromVector checks that non-nullable columns pass it as their null_percentage
// (i.e. "not applicable").
constexpr int64_t kAlternatingOrNa = -1;

template <typename T>
std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't this be factored out in testing/random.h?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It'll need to depend on libarrow_testing.so, not sure if this is a problem.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it is OK, I'd prefer to save this refactoring for a later point in time, in case more changes are needed to this.

const std::array<T, 2>& sample_values) {
  // Builds `vector_size` elements of benchmark input.
  //  - true_percentage == kAlternatingOrNa: the elements are 0, 1, 0, 1, ... converted
  //    to T; sample_values is not consulted.
  //  - otherwise: each element is sample_values[1] with probability
  //    true_percentage / 100 and sample_values[0] with the remaining probability.
  //
  // Size the result from the vector_size argument rather than the BENCHMARK_SIZE global
  // so the parameter is actually honored.
  std::vector<T> values(vector_size, {});
  if (true_percentage == kAlternatingOrNa) {
    int n = {0};
    std::generate(values.begin(), values.end(), [&n] { return n++ % 2; });
  } else {
    // Constant seed: the engine starts from the same state on every call.
    std::default_random_engine rng(500);
    double true_probability = static_cast<double>(true_percentage) / 100.0;
    std::bernoulli_distribution dist(true_probability);
    // dist(rng) yields a bool that is used as the index: false -> [0], true -> [1].
    std::generate(values.begin(), values.end(), [&] { return sample_values[dist(rng)]; });
  }
  return values;
}

template <typename ParquetType>
std::shared_ptr<::arrow::Table> TableFromVector(
const std::vector<typename ParquetType::c_type>& vec, bool nullable) {
const std::vector<typename ParquetType::c_type>& vec, bool nullable,
int64_t null_percentage = kAlternatingOrNa) {
if (!nullable) {
ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa);
}
std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType>>();
NumericBuilder<ArrowType<ParquetType>> builder;
if (nullable) {
std::vector<uint8_t> valid_bytes(BENCHMARK_SIZE, 0);
int n = {0};
std::generate(valid_bytes.begin(), valid_bytes.end(), [&n] { return n++ % 2; });
// Note true values select index 1 of sample_values
auto valid_bytes = RandomVector<uint8_t>(/*true_percentage=*/null_percentage,
BENCHMARK_SIZE, /*sample_values=*/{1, 0});
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean the valid bitmap only contains bytes 0x00 and 0x01?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, the bitmap only has 0b00000001 and 0b00000000 as possible words, or more-or-less one bit every 8th position.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think that is true. It is a confusing contract (maybe taking bool* would be better?), but I read this as converting 1 and 0 to the corresponding bits. (Under the covers, if I traced correctly, this calls ArrayBuilder::UnsafeAppendToBitmap, which ultimately calls GenerateBitsUnrolled, which converts bytes to bits.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see. Pity.

EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), valid_bytes.data()));
} else {
EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), nullptr));
Expand All @@ -118,13 +142,12 @@ std::shared_ptr<::arrow::Table> TableFromVector(

template <>
std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<bool>& vec,
bool nullable) {
bool nullable,
int64_t null_percentage) {
BooleanBuilder builder;
if (nullable) {
std::vector<bool> valid_bytes(BENCHMARK_SIZE, 0);
int n = {0};
std::generate(valid_bytes.begin(), valid_bytes.end(),
[&n] { return (n++ % 2) != 0; });
auto valid_bytes = RandomVector<bool>(/*true_percentage=*/null_percentage,
BENCHMARK_SIZE, {true, false});
EXIT_NOT_OK(builder.AppendValues(vec, valid_bytes));
} else {
EXIT_NOT_OK(builder.AppendValues(vec));
Expand All @@ -141,7 +164,7 @@ std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<b
template <bool nullable, typename ParquetType>
static void BM_WriteColumn(::benchmark::State& state) {
using T = typename ParquetType::c_type;
std::vector<T> values(BENCHMARK_SIZE, static_cast<T>(128));
std::vector<T> values(BENCHMARK_SIZE, 128);
std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);

while (state.KeepRunning()) {
Expand All @@ -164,12 +187,25 @@ BENCHMARK_TEMPLATE2(BM_WriteColumn, true, DoubleType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, false, BooleanType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType);

// Supplies the two sample values a benchmark column is filled from; the returned array
// is what BM_ReadColumn hands to RandomVector as `sample_values`.
template <typename T>
struct Examples {
  static constexpr std::array<T, 2> values() {
    return std::array<T, 2>{{127, 128}};
  }
};

// bool columns can only hold the two truth values, listed as {false, true}.
template <>
struct Examples<bool> {
  static constexpr std::array<bool, 2> values() {
    return std::array<bool, 2>{{false, true}};
  }
};

template <bool nullable, typename ParquetType>
static void BM_ReadColumn(::benchmark::State& state) {
using T = typename ParquetType::c_type;

std::vector<T> values(BENCHMARK_SIZE, static_cast<T>(128));
std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
auto values = RandomVector<T>(/*percentage=*/state.range(1), BENCHMARK_SIZE,
Examples<T>::values());

std::shared_ptr<::arrow::Table> table =
TableFromVector<ParquetType>(values, nullable, state.range(0));
auto output = CreateOutputStream();
EXIT_NOT_OK(WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));

Expand All @@ -187,17 +223,62 @@ static void BM_ReadColumn(::benchmark::State& state) {
SetBytesProcessed<nullable, ParquetType>(state);
}

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type);
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type);

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type);
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type);

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType);
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType);

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType);
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType);
// There are two parameters here that cover different data distributions.
// null_percentage governs the distribution, and therefore the runs, of null values.
// first_value_percentage governs the distribution of values (we select from 1 of 2),
// so when it is 0 or 100, RLE is triggered all the time. When it is a value in the
// range (0, 100), there will be some percentage of RLE-encoded values and some
// percentage of literal-encoded values (RLE is much less likely with percentages
// close to 50).
BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/1})
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/10})
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/50});

// Nullable Int32: vary the null percentage and the value mix together.
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
->Args({/*null_percentage=*/10, /*first_value_percentage=*/10})
->Args({/*null_percentage=*/25, /*first_value_percentage=*/5})
->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
->Args({/*null_percentage=*/50, /*first_value_percentage=*/0})
->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
->Args({/*null_percentage=*/99, /*first_value_percentage=*/0});

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type)
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/1})
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/10})
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/50});
// Nullable Int64 gets the most (null_percentage, first_value_percentage) combinations
// of the types registered in this section.
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type)
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
->Args({/*null_percentage=*/5, /*first_value_percentage=*/5})
->Args({/*null_percentage=*/10, /*first_value_percentage=*/5})
->Args({/*null_percentage=*/25, /*first_value_percentage=*/10})
->Args({/*null_percentage=*/30, /*first_value_percentage=*/10})
->Args({/*null_percentage=*/35, /*first_value_percentage=*/10})
->Args({/*null_percentage=*/45, /*first_value_percentage=*/25})
->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
->Args({/*null_percentage=*/50, /*first_value_percentage=*/1})
->Args({/*null_percentage=*/75, /*first_value_percentage=*/1})
->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
->Args({/*null_percentage=*/99, /*first_value_percentage=*/0});

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType)
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/20});
// Less coverage because int64_t should be a pretty good representation for nullability
// and repeating values.
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType)
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
->Args({/*null_percentage=*/25, /*first_value_percentage=*/25});

BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType)
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
// NOTE(review): nullable is false here, and the visible part of the BooleanType
// TableFromVector specialization only reads null_percentage when nullable is true, so
// the 1 below looks like it has no effect on the generated data -- confirm whether
// kAlternatingOrNa was intended.
->Args({/*null_percentage=*/1, /*first_value_percentage=*/20});
BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/1})
->Args({/*null_percentage=*/5, /*first_value_percentage=*/10});

static void BM_ReadIndividualRowGroups(::benchmark::State& state) {
std::vector<int64_t> values(BENCHMARK_SIZE, 128);
Expand Down