Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 89 additions & 12 deletions cpp/src/parquet/arrow/reader_writer_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,56 @@ BENCHMARK_TEMPLATE2(BM_WriteColumn, true, DoubleType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, false, BooleanType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType);

// Sentinel for the `unique_values` benchmark argument. When RandomStringTable
// receives this value it generates strings without a cap on the number of
// distinct values instead of drawing from a fixed-size pool of repeats.
// Declared constexpr so the sentinel cannot be modified at runtime and is
// usable in constant expressions.
constexpr int32_t kInfiniteUniqueValues = -1;

std::shared_ptr<::arrow::Table> RandomStringTable(int64_t length, int64_t unique_values,
int64_t null_percentage) {
std::shared_ptr<::arrow::DataType> type = ::arrow::utf8();
std::shared_ptr<::arrow::Array> arr;
::arrow::random::RandomArrayGenerator generator(/*seed=*/500);
double null_probability = static_cast<double>(null_percentage) / 100.0;
if (unique_values == kInfiniteUniqueValues) {
arr = generator.String(length, /*min_length=*/3, /*max_length=*/32,
/*null_probability=*/null_probability);
} else {
arr = generator.StringWithRepeats(length, /*unique=*/unique_values,
/*min_length=*/3, /*max_length=*/32,
/*null_probability=*/null_probability);
}
return ::arrow::Table::Make(
::arrow::schema({::arrow::field("column", type, null_percentage > 0)}), {arr});
}

static void BM_WriteBinaryColumn(::benchmark::State& state) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it use the PLAIN encoding? Add a comment?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a comment near the parameters of each benchmark explaining that we vary unique_values to trigger the code paths for dictionary and plain encodings. I also tried to add a check within the benchmark to validate that we get the expected encodings, but it turned out to be too complicated: the encoding can change from page to page, and encodings also apply to the definition and repetition levels (IIUC).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Can you just confirm that the expected encodings are used (and add a comment)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just saw the comment below, sorry. Please disregard. :-)

std::shared_ptr<::arrow::Table> table =
RandomStringTable(BENCHMARK_SIZE, state.range(1), state.range(0));

while (state.KeepRunning()) {
auto output = CreateOutputStream();
EXIT_NOT_OK(
WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
}

// Offsets + data
int64_t total_bytes = table->column(0)->chunk(0)->data()->buffers[1]->size() +
table->column(0)->chunk(0)->data()->buffers[2]->size();
state.SetItemsProcessed(BENCHMARK_SIZE * state.iterations());
state.SetBytesProcessed(total_bytes * state.iterations());
}

// Registers BM_WriteBinaryColumn over (null_probability, unique_values) pairs.
// NOTE: although the first argument is labelled "null_probability", the
// benchmark forwards it as RandomStringTable's `null_percentage`, which is
// divided by 100 there; the values below are therefore percentages.
BENCHMARK(BM_WriteBinaryColumn)
->ArgNames({"null_probability", "unique_values"})
// We vary unique values to trigger the dictionary-encoded (for low-cardinality)
// and plain (for high-cardinality) code paths.
// kInfiniteUniqueValues makes RandomStringTable use the generator's String()
// call instead of StringWithRepeats().
->Args({0, 32})
->Args({0, kInfiniteUniqueValues})
->Args({1, 32})
->Args({50, 32})
->Args({99, 32})
->Args({1, kInfiniteUniqueValues})
->Args({50, kInfiniteUniqueValues})
->Args({99, kInfiniteUniqueValues});

template <typename T>
struct Examples {
static constexpr std::array<T, 2> values() { return {127, 128}; }
Expand All @@ -208,7 +258,7 @@ struct Examples<bool> {
};

static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
int64_t num_values = -1, int64_t bytes_per_value = -1) {
int64_t num_values = -1, int64_t total_bytes = -1) {
auto output = CreateOutputStream();
EXIT_NOT_OK(
WriteTable(table, ::arrow::default_memory_pool(), output, table.num_rows()));
Expand All @@ -228,20 +278,20 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table&
num_values = table.num_rows();
}
state.SetItemsProcessed(num_values * state.iterations());
if (bytes_per_value != -1) {
state.SetBytesProcessed(num_values * state.iterations() * bytes_per_value);
if (total_bytes != -1) {
state.SetBytesProcessed(total_bytes * state.iterations());
}
}

static void BenchmarkReadArray(::benchmark::State& state,
const std::shared_ptr<Array>& array, bool nullable,
int64_t num_values = -1, int64_t bytes_per_value = -1) {
int64_t num_values = -1, int64_t total_bytes = -1) {
auto schema = ::arrow::schema({field("s", array->type(), nullable)});
auto table = ::arrow::Table::Make(schema, {array}, array->length());

EXIT_NOT_OK(table->Validate());

BenchmarkReadTable(state, *table, num_values, bytes_per_value);
BenchmarkReadTable(state, *table, num_values, total_bytes);
}

//
Expand All @@ -259,7 +309,7 @@ static void BM_ReadColumn(::benchmark::State& state) {
TableFromVector<ParquetType>(values, nullable, state.range(0));

BenchmarkReadTable(state, *table, table->num_rows(),
sizeof(typename ParquetType::c_type));
sizeof(typename ParquetType::c_type) * table->num_rows());
}

// There are two parameters here that cover different data distributions.
Expand Down Expand Up @@ -319,6 +369,33 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({kAlternatingOrNa, 1})
->Args({5, 10});

//
// Benchmark reading binary column
//

// Benchmarks reading back a random string column. Benchmark argument 0 is
// passed to RandomStringTable as the null percentage and argument 1 as the
// number of unique values.
static void BM_ReadBinaryColumn(::benchmark::State& state) {
  const int64_t null_percentage = state.range(0);
  const int64_t unique_values = state.range(1);
  std::shared_ptr<::arrow::Table> table =
      RandomStringTable(BENCHMARK_SIZE, unique_values, null_percentage);

  // Bytes processed = offsets buffer + character data buffer of the first chunk.
  const auto column_data = table->column(0)->chunk(0)->data();
  const int64_t offsets_bytes = column_data->buffers[1]->size();
  const int64_t data_bytes = column_data->buffers[2]->size();
  int64_t total_bytes = offsets_bytes + data_bytes;

  BenchmarkReadTable(state, *table, table->num_rows(), total_bytes);
}

// Registers BM_ReadBinaryColumn over (null_probability, unique_values) pairs.
// NOTE: although the first argument is labelled "null_probability", the
// benchmark forwards it as RandomStringTable's `null_percentage`, which is
// divided by 100 there; the values below are therefore percentages.
BENCHMARK(BM_ReadBinaryColumn)
->ArgNames({"null_probability", "unique_values"})
// We vary unique values to trigger the dictionary-encoded (for low-cardinality)
// and plain (for high-cardinality) code paths.
// kInfiniteUniqueValues makes RandomStringTable use the generator's String()
// call instead of StringWithRepeats().
->Args({0, 32})
->Args({0, kInfiniteUniqueValues})
->Args({1, 32})
->Args({50, 32})
->Args({99, 32})
->Args({1, kInfiniteUniqueValues})
->Args({50, kInfiniteUniqueValues})
->Args({99, kInfiniteUniqueValues});

//
// Benchmark reading a nested column
//
Expand Down Expand Up @@ -383,7 +460,7 @@ static void BM_ReadStructColumn(::benchmark::State& state) {
::arrow::random::RandomArrayGenerator rng(42);
auto array = MakeStructArray(&rng, kNumValues, null_probability);

BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
}

BENCHMARK(BM_ReadStructColumn)->Apply(NestedReadArguments);
Expand All @@ -402,7 +479,7 @@ static void BM_ReadStructOfStructColumn(::benchmark::State& state) {
auto values2 = MakeStructArray(&rng, kNumValues, null_probability);
auto array = MakeStructArray(&rng, {values1, values2}, null_probability);

BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
}

BENCHMARK(BM_ReadStructOfStructColumn)->Apply(NestedReadArguments);
Expand All @@ -426,7 +503,7 @@ static void BM_ReadStructOfListColumn(::benchmark::State& state) {
auto array = MakeStructArray(&rng, {list1, list2}, null_probability,
/*propagate_validity =*/true);

BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
}

BENCHMARK(BM_ReadStructOfListColumn)->Apply(NestedReadArguments);
Expand All @@ -445,7 +522,7 @@ static void BM_ReadListColumn(::benchmark::State& state) {

auto array = rng.List(*values, kNumValues / 10, null_probability);

BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
}

BENCHMARK(BM_ReadListColumn)->Apply(NestedReadArguments);
Expand All @@ -464,7 +541,7 @@ static void BM_ReadListOfStructColumn(::benchmark::State& state) {

auto array = rng.List(*values, kNumValues / 10, null_probability);

BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
}

BENCHMARK(BM_ReadListOfStructColumn)->Apply(NestedReadArguments);
Expand All @@ -484,7 +561,7 @@ static void BM_ReadListOfListColumn(::benchmark::State& state) {
auto inner = rng.List(*values, kNumValues / 10, null_probability);
auto array = rng.List(*inner, kNumValues / 100, null_probability);

BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue * kNumValues);
}

BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments);
Expand Down