From c7f272c10c64db5e78523de176c88f1057bae619 Mon Sep 17 00:00:00 2001 From: comphead Date: Mon, 15 Apr 2024 16:53:07 -0700 Subject: [PATCH 1/5] Adding TPCH benchmarks for Sort Merge Join --- benchmarks/bench.sh | 21 +++++++++++++++++++++ benchmarks/src/tpch/run.rs | 10 +++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index a72400892752..2e47aeeaf528 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -213,6 +213,12 @@ main() { tpch_mem10) run_tpch_mem "10" ;; + tpch_smj) + run_tpch_smj "1" + ;; + tpch_smj10) + run_tpch_smj "10" + ;; parquet) run_parquet ;; @@ -320,6 +326,21 @@ run_tpch() { $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --format parquet -o ${RESULTS_FILE} } +# Runs the tpch benchmark with sort merge join +run_tpch_smj() { + SCALE_FACTOR=$1 + if [ -z "$SCALE_FACTOR" ] ; then + echo "Internal error: Scale factor not specified" + exit 1 + fi + TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}" + + RESULTS_FILE="${RESULTS_DIR}/tpch_smj_sf${SCALE_FACTOR}.json" + echo "RESULTS_FILE: ${RESULTS_FILE}" + echo "Running tpch SMJ benchmark..." + $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" -j false --format parquet -o ${RESULTS_FILE} +} + # Runs the tpch in memory run_tpch_mem() { SCALE_FACTOR=$1 diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index 564a2f05b6fe..f224b9543ef7 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -42,6 +42,9 @@ use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION}; use log::info; use structopt::StructOpt; +// hack to avoid `default_value is meaningless for bool` errors +type BoolDefaultTrue = bool; + /// Run the tpch benchmark. /// /// This benchmarks is derived from the [TPC-H][1] version @@ -81,6 +84,10 @@ pub struct RunOpt { /// Whether to disable collection of statistics (and cost based optimizations) or not. 
#[structopt(short = "S", long = "disable-statistics")] disable_statistics: bool, + + /// Whether to disable collection of statistics (and cost based optimizations) or not. + #[structopt(short = "j", long = "hash-join", default_value = "true")] + prefer_hash_join: BoolDefaultTrue, } const TPCH_QUERY_START_ID: usize = 1; @@ -107,10 +114,11 @@ impl RunOpt { } async fn benchmark_query(&self, query_id: usize) -> Result> { - let config = self + let mut config = self .common .config() .with_collect_statistics(!self.disable_statistics); + config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join; let ctx = SessionContext::new_with_config(config); // register tables From 301c827fe0cf5061635107dd05cb641e0dc5f582 Mon Sep 17 00:00:00 2001 From: comphead Date: Tue, 16 Apr 2024 08:58:48 -0700 Subject: [PATCH 2/5] Update benchmarks/bench.sh Co-authored-by: Andy Grove --- benchmarks/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index 2e47aeeaf528..088edc56dfb0 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -338,7 +338,7 @@ run_tpch_smj() { RESULTS_FILE="${RESULTS_DIR}/tpch_smj_sf${SCALE_FACTOR}.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running tpch SMJ benchmark..." 
- $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" -j false --format parquet -o ${RESULTS_FILE} + $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join false --format parquet -o ${RESULTS_FILE} } # Runs the tpch in memory From 3faeec9ebb4d8db4b81867f470262da2d2dacb33 Mon Sep 17 00:00:00 2001 From: comphead Date: Tue, 16 Apr 2024 09:06:13 -0700 Subject: [PATCH 3/5] fix benches --- benchmarks/src/tpch/run.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index f224b9543ef7..346285c3bc39 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -312,7 +312,7 @@ mod tests { use super::*; use datafusion::common::exec_err; - use datafusion::error::{DataFusionError, Result}; + use datafusion::error::Result; use datafusion_proto::bytes::{ logical_plan_from_bytes, logical_plan_to_bytes, physical_plan_from_bytes, physical_plan_to_bytes, @@ -347,6 +347,7 @@ mod tests { mem_table: false, output_path: None, disable_statistics: false, + prefer_hash_join: true }; opt.register_tables(&ctx).await?; let queries = get_query_sql(query)?; @@ -379,6 +380,7 @@ mod tests { mem_table: false, output_path: None, disable_statistics: false, + prefer_hash_join: true }; opt.register_tables(&ctx).await?; let queries = get_query_sql(query)?; From 4d671e2b869eaf553dbb7a92cb807f4fe6969357 Mon Sep 17 00:00:00 2001 From: comphead Date: Tue, 16 Apr 2024 09:24:35 -0700 Subject: [PATCH 4/5] fmt --- benchmarks/src/tpch/run.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index 346285c3bc39..2329f490ab6b 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -347,7 +347,7 @@ mod tests { mem_table: false, output_path: None, disable_statistics: false, - prefer_hash_join: true + prefer_hash_join: true, }; 
opt.register_tables(&ctx).await?; let queries = get_query_sql(query)?; @@ -380,7 +380,7 @@ mod tests { mem_table: false, output_path: None, disable_statistics: false, - prefer_hash_join: true + prefer_hash_join: true, }; opt.register_tables(&ctx).await?; let queries = get_query_sql(query)?; From a88b278ca46f652332568836570634d251d2a9e0 Mon Sep 17 00:00:00 2001 From: comphead Date: Tue, 16 Apr 2024 09:27:07 -0700 Subject: [PATCH 5/5] comments --- benchmarks/src/tpch/run.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index 2329f490ab6b..f2a93d2ea549 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -85,8 +85,9 @@ pub struct RunOpt { #[structopt(short = "S", long = "disable-statistics")] disable_statistics: bool, - /// Whether to disable collection of statistics (and cost based optimizations) or not. - #[structopt(short = "j", long = "hash-join", default_value = "true")] + /// If true then hash join used, if false then sort merge join + /// True by default. + #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")] prefer_hash_join: BoolDefaultTrue, }