From c58890947d460f94466f7e7525f70299dd6138cd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 14 Jan 2024 07:30:47 -0500 Subject: [PATCH] Add "Extended" clickbench queries --- benchmarks/bench.sh | 17 ++++++ benchmarks/queries/clickbench/README.md | 33 +++++++++++ benchmarks/queries/clickbench/README.txt | 1 - benchmarks/queries/clickbench/extended.sql | 1 + benchmarks/src/clickbench.rs | 66 ++++++++++++++-------- 5 files changed, 92 insertions(+), 26 deletions(-) create mode 100644 benchmarks/queries/clickbench/README.md delete mode 100644 benchmarks/queries/clickbench/README.txt create mode 100644 benchmarks/queries/clickbench/extended.sql diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index bdbdc0e51762..ccaf26eb798d 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -74,6 +74,7 @@ parquet: Benchmark of parquet reader's filtering speed sort: Benchmark of sorting speed clickbench_1: ClickBench queries against a single parquet file clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet +clickbench_extended: ClickBench "inspired" queries against a single parquet (DataFusion specific) ********** * Supported Configuration (Environment Variables) @@ -155,6 +156,9 @@ main() { clickbench_partitioned) data_clickbench_partitioned ;; + clickbench_extended) + data_clickbench_1 + ;; *) echo "Error: unknown benchmark '$BENCHMARK' for data generation" usage @@ -193,6 +197,7 @@ main() { run_sort run_clickbench_1 run_clickbench_partitioned + run_clickbench_extended ;; tpch) run_tpch "1" @@ -218,6 +223,9 @@ main() { clickbench_partitioned) run_clickbench_partitioned ;; + clickbench_extended) + run_clickbench_extended + ;; *) echo "Error: unknown benchmark '$BENCHMARK' for run" usage @@ -401,6 +409,15 @@ run_clickbench_partitioned() { $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE} } +# Runs the 
clickbench "extended" benchmark with a single large parquet file +run_clickbench_extended() { + RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json" + echo "RESULTS_FILE: ${RESULTS_FILE}" + echo "Running clickbench (1 file) extended benchmark..." + $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o ${RESULTS_FILE} +} + + compare_benchmarks() { BASE_RESULTS_DIR="${SCRIPT_DIR}/results" BRANCH1="${ARG2}" diff --git a/benchmarks/queries/clickbench/README.md b/benchmarks/queries/clickbench/README.md new file mode 100644 index 000000000000..d5105afd4832 --- /dev/null +++ b/benchmarks/queries/clickbench/README.md @@ -0,0 +1,33 @@ +# ClickBench queries + +This directory contains queries for the ClickBench benchmark https://benchmark.clickhouse.com/ + +ClickBench is focused on aggregation and filtering performance (though it has no Joins) + +## Files: +* `queries.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository] +* `extended.sql` - "Extended" DataFusion specific queries. + +[ClickBench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql + +## "Extended" Queries +The "extended" queries are not part of the official ClickBench benchmark. +Instead they are used to test other DataFusion features that are not +covered by the standard benchmark + +Each description below is for the corresponding line in `extended.sql` (line 1 +is `Q0`, line 2 is `Q1`, etc.) + +### Q0 +Models initial Data exploration, to understand some statistics of data. 
+Important Query Properties: multiple `COUNT DISTINCT` on strings + ```sql +SELECT + COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") +FROM hits; +``` + + + + diff --git a/benchmarks/queries/clickbench/README.txt b/benchmarks/queries/clickbench/README.txt deleted file mode 100644 index b46900956e54..000000000000 --- a/benchmarks/queries/clickbench/README.txt +++ /dev/null @@ -1 +0,0 @@ -Downloaded from https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql diff --git a/benchmarks/queries/clickbench/extended.sql b/benchmarks/queries/clickbench/extended.sql new file mode 100644 index 000000000000..82c0266af61a --- /dev/null +++ b/benchmarks/queries/clickbench/extended.sql @@ -0,0 +1 @@ +SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits; \ No newline at end of file diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs index a6d32eb39f31..69a650a106c7 100644 --- a/benchmarks/src/clickbench.rs +++ b/benchmarks/src/clickbench.rs @@ -15,13 +15,14 @@ // specific language governing permissions and limitations // under the License. 
+use std::path::Path; use std::{path::PathBuf, time::Instant}; use datafusion::{ - common::exec_err, error::{DataFusionError, Result}, prelude::SessionContext, }; +use datafusion_common::exec_datafusion_err; use structopt::StructOpt; use crate::{BenchmarkRun, CommonOpt}; @@ -69,15 +70,49 @@ pub struct RunOpt { output_path: Option<PathBuf>, } -const CLICKBENCH_QUERY_START_ID: usize = 0; -const CLICKBENCH_QUERY_END_ID: usize = 42; +struct AllQueries { + queries: Vec<String>, +} + +impl AllQueries { + fn try_new(path: &Path) -> Result<Self> { + // ClickBench has all queries in a single file identified by line number + let all_queries = std::fs::read_to_string(path) + .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?; + Ok(Self { + queries: all_queries.lines().map(|s| s.to_string()).collect(), + }) + } + + /// Returns the text of query `query_id` + fn get_query(&self, query_id: usize) -> Result<&str> { + self.queries + .get(query_id) + .ok_or_else(|| { + let min_id = self.min_query_id(); + let max_id = self.max_query_id(); + exec_datafusion_err!( + "Invalid query id {query_id}. 
Must be between {min_id} and {max_id}" + ) + }) + .map(|s| s.as_str()) + } + + fn min_query_id(&self) -> usize { + 0 + } + fn max_query_id(&self) -> usize { + self.queries.len().saturating_sub(1) + } +} impl RunOpt { pub async fn run(self) -> Result<()> { println!("Running benchmarks with the following options: {self:?}"); + let queries = AllQueries::try_new(self.queries_path.as_path())?; let query_range = match self.query { Some(query_id) => query_id..=query_id, - None => CLICKBENCH_QUERY_START_ID..=CLICKBENCH_QUERY_END_ID, + None => queries.min_query_id()..=queries.max_query_id(), }; let config = self.common.config(); @@ -88,12 +123,12 @@ impl RunOpt { let mut benchmark_run = BenchmarkRun::new(); for query_id in query_range { benchmark_run.start_new_case(&format!("Query {query_id}")); - let sql = self.get_query(query_id)?; + let sql = queries.get_query(query_id)?; println!("Q{query_id}: {sql}"); for i in 0..iterations { let start = Instant::now(); - let results = ctx.sql(&sql).await?.collect().await?; + let results = ctx.sql(sql).await?.collect().await?; let elapsed = start.elapsed(); let ms = elapsed.as_secs_f64() * 1000.0; let row_count: usize = results.iter().map(|b| b.num_rows()).sum(); @@ -120,23 +155,4 @@ impl RunOpt { ) }) } - - /// Returns the text of query `query_id` - fn get_query(&self, query_id: usize) -> Result<String> { - if query_id > CLICKBENCH_QUERY_END_ID { - return exec_err!( - "Invalid query id {query_id}. Must be between {CLICKBENCH_QUERY_START_ID} and {CLICKBENCH_QUERY_END_ID}" - ); - } - - let path = self.queries_path.as_path(); - - // ClickBench has all queries in a single file identified by line number - let all_queries = std::fs::read_to_string(path).map_err(|e| { - DataFusionError::Execution(format!("Could not open {path:?}: {e}")) - })?; - let all_queries: Vec<_> = all_queries.lines().collect(); - - Ok(all_queries.get(query_id).map(|s| s.to_string()).unwrap()) - } }