From f6f5f9bf87127421468a0cef83413ea332a909ec Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 27 Mar 2026 10:32:21 -0400 Subject: [PATCH 1/2] scale smj benchmarks and add mark joins --- benchmarks/src/smj.rs | 246 +++++++++++++++++++++++++++--------------- 1 file changed, 161 insertions(+), 85 deletions(-) diff --git a/benchmarks/src/smj.rs b/benchmarks/src/smj.rs index d782762a1be4..3d173b7116e2 100644 --- a/benchmarks/src/smj.rs +++ b/benchmarks/src/smj.rs @@ -39,7 +39,7 @@ use futures::StreamExt; #[derive(Debug, Args, Clone)] #[command(verbatim_doc_comment)] pub struct RunOpt { - /// Query number (between 1 and 23). If not specified, runs all queries + /// Query number (between 1 and 26). If not specified, runs all queries #[arg(short, long)] query: Option, @@ -60,27 +60,27 @@ pub struct RunOpt { /// - Key cardinality (rows per key) /// - Filter selectivity (if applicable) const SMJ_QUERIES: &[&str] = &[ - // Q1: INNER 100K x 100K | 1:1 + // Q1: INNER 1M x 1M | 1:1 r#" WITH t1_sorted AS ( - SELECT value as key FROM range(100000) ORDER BY value + SELECT value as key FROM range(1000000) ORDER BY value ), t2_sorted AS ( - SELECT value as key FROM range(100000) ORDER BY value + SELECT value as key FROM range(1000000) ORDER BY value ) SELECT t1_sorted.key as k1, t2_sorted.key as k2 FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key "#, - // Q2: INNER 100K x 1M | 1:10 + // Q2: INNER 1M x 10M | 1:10 r#" WITH t1_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(1000000) + SELECT value % 100000 as key, value as data + FROM range(10000000) ORDER BY key, data ) SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 @@ -101,16 +101,16 @@ const SMJ_QUERIES: &[&str] = &[ SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 FROM t1_sorted JOIN 
t2_sorted ON t1_sorted.key = t2_sorted.key "#, - // Q4: INNER 100K x 1M | 1:10 | 1% + // Q4: INNER 1M x 10M | 1:10 | 1% r#" WITH t1_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(1000000) + SELECT value % 100000 as key, value as data + FROM range(10000000) ORDER BY key, data ) SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 @@ -133,63 +133,63 @@ const SMJ_QUERIES: &[&str] = &[ FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key WHERE t1_sorted.data <> t2_sorted.data AND t2_sorted.data % 10 = 0 "#, - // Q6: LEFT 100K x 1M | 1:10 + // Q6: LEFT 1M x 10M | 1:10 r#" WITH t1_sorted AS ( - SELECT value % 10500 as key, value as data - FROM range(100000) + SELECT value % 105000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(1000000) + SELECT value % 100000 as key, value as data + FROM range(10000000) ORDER BY key, data ) SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 FROM t1_sorted LEFT JOIN t2_sorted ON t1_sorted.key = t2_sorted.key "#, - // Q7: LEFT 100K x 1M | 1:10 | 50% + // Q7: LEFT 1M x 10M | 1:10 | 50% r#" WITH t1_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(1000000) + SELECT value % 100000 as key, value as data + FROM range(10000000) ORDER BY key, data ) SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 FROM t1_sorted LEFT JOIN t2_sorted ON t1_sorted.key = t2_sorted.key WHERE t2_sorted.data IS NULL OR t2_sorted.data % 2 = 0 "#, - // Q8: FULL 100K x 100K | 1:10 + // Q8: FULL 1M x 1M | 1:10 r#" WITH t1_sorted AS ( - SELECT value % 
10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 12500 as key, value as data - FROM range(100000) + SELECT value % 125000 as key, value as data + FROM range(1000000) ORDER BY key, data ) SELECT t1_sorted.key as k1, t1_sorted.data as d1, t2_sorted.key as k2, t2_sorted.data as d2 FROM t1_sorted FULL JOIN t2_sorted ON t1_sorted.key = t2_sorted.key "#, - // Q9: FULL 100K x 1M | 1:10 | 10% + // Q9: FULL 1M x 10M | 1:10 | 10% r#" WITH t1_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(1000000) + SELECT value % 100000 as key, value as data + FROM range(10000000) ORDER BY key, data ) SELECT t1_sorted.key as k1, t1_sorted.data as d1, @@ -199,16 +199,16 @@ const SMJ_QUERIES: &[&str] = &[ OR t1_sorted.data <> t2_sorted.data) AND (t1_sorted.data IS NULL OR t1_sorted.data % 10 = 0) "#, - // Q10: LEFT SEMI 100K x 1M | 1:10 + // Q10: LEFT SEMI 1M x 10M | 1:10 r#" WITH t1_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key - FROM range(1000000) + SELECT value % 100000 as key + FROM range(10000000) ORDER BY key ) SELECT t1_sorted.key, t1_sorted.data @@ -218,16 +218,16 @@ const SMJ_QUERIES: &[&str] = &[ WHERE t2_sorted.key = t1_sorted.key ) "#, - // Q11: LEFT SEMI 100K x 1M | 1:10 | 1% + // Q11: LEFT SEMI 1M x 10M | 1:10 | 1% r#" WITH t1_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(1000000) + SELECT value % 100000 as key, 
value as data + FROM range(10000000) ORDER BY key, data ) SELECT t1_sorted.key, t1_sorted.data @@ -239,16 +239,16 @@ const SMJ_QUERIES: &[&str] = &[ AND t2_sorted.data % 100 = 0 ) "#, - // Q12: LEFT SEMI 100K x 1M | 1:10 | 50% + // Q12: LEFT SEMI 1M x 10M | 1:10 | 50% r#" WITH t1_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(1000000) + SELECT value % 100000 as key, value as data + FROM range(10000000) ORDER BY key, data ) SELECT t1_sorted.key, t1_sorted.data @@ -260,16 +260,16 @@ const SMJ_QUERIES: &[&str] = &[ AND t2_sorted.data % 2 = 0 ) "#, - // Q13: LEFT SEMI 100K x 1M | 1:10 | 90% + // Q13: LEFT SEMI 1M x 10M | 1:10 | 90% r#" WITH t1_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(1000000) + SELECT value % 100000 as key, value as data + FROM range(10000000) ORDER BY key, data ) SELECT t1_sorted.key, t1_sorted.data @@ -281,16 +281,16 @@ const SMJ_QUERIES: &[&str] = &[ AND t2_sorted.data % 10 <> 0 ) "#, - // Q14: LEFT ANTI 100K x 1M | 1:10 + // Q14: LEFT ANTI 1M x 10M | 1:10 r#" WITH t1_sorted AS ( - SELECT value % 10500 as key, value as data - FROM range(100000) + SELECT value % 105000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key - FROM range(1000000) + SELECT value % 100000 as key + FROM range(10000000) ORDER BY key ) SELECT t1_sorted.key, t1_sorted.data @@ -300,16 +300,16 @@ const SMJ_QUERIES: &[&str] = &[ WHERE t2_sorted.key = t1_sorted.key ) "#, - // Q15: LEFT ANTI 100K x 1M | 1:10 | partial match + // Q15: LEFT ANTI 1M x 10M | 1:10 | partial match r#" WITH t1_sorted AS ( - SELECT value % 12000 as key, value as 
data - FROM range(100000) + SELECT value % 120000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key - FROM range(1000000) + SELECT value % 100000 as key + FROM range(10000000) ORDER BY key ) SELECT t1_sorted.key, t1_sorted.data @@ -319,16 +319,16 @@ const SMJ_QUERIES: &[&str] = &[ WHERE t2_sorted.key = t1_sorted.key ) "#, - // Q16: LEFT ANTI 100K x 100K | 1:1 | stress + // Q16: LEFT ANTI 1M x 1M | 1:1 | stress r#" WITH t1_sorted AS ( - SELECT value % 11000 as key, value as data - FROM range(100000) + SELECT value % 110000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key - FROM range(100000) + SELECT value % 100000 as key + FROM range(1000000) ORDER BY key ) SELECT t1_sorted.key, t1_sorted.data @@ -338,32 +338,32 @@ const SMJ_QUERIES: &[&str] = &[ WHERE t2_sorted.key = t1_sorted.key ) "#, - // Q17: INNER 100K x 5M | 1:50 | 5% + // Q17: INNER 1M x 50M | 1:50 | 5% r#" WITH t1_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(5000000) + SELECT value % 100000 as key, value as data + FROM range(50000000) ORDER BY key, data ) SELECT t1_sorted.key, t1_sorted.data as d1, t2_sorted.data as d2 FROM t1_sorted JOIN t2_sorted ON t1_sorted.key = t2_sorted.key WHERE t2_sorted.data <> t1_sorted.data AND t2_sorted.data % 20 = 0 "#, - // Q18: LEFT SEMI 100K x 5M | 1:50 | 2% + // Q18: LEFT SEMI 1M x 50M | 1:50 | 2% r#" WITH t1_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(100000) + SELECT value % 100000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key, value as data - FROM range(5000000) + SELECT value % 100000 as key, value as data + FROM range(50000000) ORDER BY key, data ) SELECT 
t1_sorted.key, t1_sorted.data @@ -375,16 +375,16 @@ const SMJ_QUERIES: &[&str] = &[ AND t2_sorted.data % 50 = 0 ) "#, - // Q19: LEFT ANTI 100K x 5M | 1:50 | partial match + // Q19: LEFT ANTI 1M x 50M | 1:50 | partial match r#" WITH t1_sorted AS ( - SELECT value % 15000 as key, value as data - FROM range(100000) + SELECT value % 150000 as key, value as data + FROM range(1000000) ORDER BY key, data ), t2_sorted AS ( - SELECT value % 10000 as key - FROM range(5000000) + SELECT value % 100000 as key + FROM range(50000000) ORDER BY key ) SELECT t1_sorted.key, t1_sorted.data @@ -456,6 +456,72 @@ const SMJ_QUERIES: &[&str] = &[ ON t1_sorted.key = t2_sorted.key AND t1_sorted.data + t2_sorted.data < 10000000 "#, + // Q24: LEFT MARK 1M x 10M | 1:10 | 1% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE t1_sorted.data < 0 + OR EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + AND t2_sorted.data <> t1_sorted.data + AND t2_sorted.data % 100 = 0 + ) + "#, + // Q25: LEFT MARK 1M x 10M | 1:10 | 50% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE t1_sorted.data < 0 + OR EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + AND t2_sorted.data <> t1_sorted.data + AND t2_sorted.data % 2 = 0 + ) + "#, + // Q26: LEFT MARK 1M x 10M | 1:10 | 90% + r#" + WITH t1_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(1000000) + ORDER BY key, data + ), + t2_sorted AS ( + SELECT value % 100000 as key, value as data + FROM range(10000000) + ORDER 
BY key, data + ) + SELECT t1_sorted.key, t1_sorted.data + FROM t1_sorted + WHERE t1_sorted.data < 0 + OR EXISTS ( + SELECT 1 FROM t2_sorted + WHERE t2_sorted.key = t1_sorted.key + AND t2_sorted.data <> t1_sorted.data + AND t2_sorted.data % 10 <> 0 + ) + "#, ]; impl RunOpt { @@ -489,7 +555,10 @@ impl RunOpt { let sql = SMJ_QUERIES[query_index]; benchmark_run.start_new_case(&format!("Query {query_id}")); - let query_run = self.benchmark_query(sql, &query_id.to_string(), &ctx).await; + let expect_mark = query_id >= 24; + let query_run = self + .benchmark_query(sql, &query_id.to_string(), expect_mark, &ctx) + .await; match query_run { Ok(query_results) => { for iter in query_results { @@ -513,6 +582,7 @@ impl RunOpt { &self, sql: &str, query_name: &str, + expect_mark: bool, ctx: &SessionContext, ) -> Result> { let mut query_results = vec![]; @@ -528,6 +598,12 @@ impl RunOpt { )); } + if expect_mark && !plan_string.contains("LeftMark") { + return Err(exec_datafusion_err!( + "Query {query_name} expected LeftMark join. Physical plan: {plan_string}" + )); + } + for i in 0..self.common.iterations { let start = Instant::now(); From 0180e554d19c19042d2227e200e8f55531c628b9 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 27 Mar 2026 11:13:40 -0400 Subject: [PATCH 2/2] remove mark join benchmarks that don't work until #21184. --- benchmarks/src/smj.rs | 80 ++----------------------------------------- 1 file changed, 2 insertions(+), 78 deletions(-) diff --git a/benchmarks/src/smj.rs b/benchmarks/src/smj.rs index 3d173b7116e2..1829b77a3d88 100644 --- a/benchmarks/src/smj.rs +++ b/benchmarks/src/smj.rs @@ -39,7 +39,7 @@ use futures::StreamExt; #[derive(Debug, Args, Clone)] #[command(verbatim_doc_comment)] pub struct RunOpt { - /// Query number (between 1 and 26). If not specified, runs all queries + /// Query number (between 1 and 23).
If not specified, runs all queries #[arg(short, long)] query: Option, @@ -456,72 +456,6 @@ const SMJ_QUERIES: &[&str] = &[ ON t1_sorted.key = t2_sorted.key AND t1_sorted.data + t2_sorted.data < 10000000 "#, - // Q24: LEFT MARK 1M x 10M | 1:10 | 1% - r#" - WITH t1_sorted AS ( - SELECT value % 100000 as key, value as data - FROM range(1000000) - ORDER BY key, data - ), - t2_sorted AS ( - SELECT value % 100000 as key, value as data - FROM range(10000000) - ORDER BY key, data - ) - SELECT t1_sorted.key, t1_sorted.data - FROM t1_sorted - WHERE t1_sorted.data < 0 - OR EXISTS ( - SELECT 1 FROM t2_sorted - WHERE t2_sorted.key = t1_sorted.key - AND t2_sorted.data <> t1_sorted.data - AND t2_sorted.data % 100 = 0 - ) - "#, - // Q25: LEFT MARK 1M x 10M | 1:10 | 50% - r#" - WITH t1_sorted AS ( - SELECT value % 100000 as key, value as data - FROM range(1000000) - ORDER BY key, data - ), - t2_sorted AS ( - SELECT value % 100000 as key, value as data - FROM range(10000000) - ORDER BY key, data - ) - SELECT t1_sorted.key, t1_sorted.data - FROM t1_sorted - WHERE t1_sorted.data < 0 - OR EXISTS ( - SELECT 1 FROM t2_sorted - WHERE t2_sorted.key = t1_sorted.key - AND t2_sorted.data <> t1_sorted.data - AND t2_sorted.data % 2 = 0 - ) - "#, - // Q26: LEFT MARK 1M x 10M | 1:10 | 90% - r#" - WITH t1_sorted AS ( - SELECT value % 100000 as key, value as data - FROM range(1000000) - ORDER BY key, data - ), - t2_sorted AS ( - SELECT value % 100000 as key, value as data - FROM range(10000000) - ORDER BY key, data - ) - SELECT t1_sorted.key, t1_sorted.data - FROM t1_sorted - WHERE t1_sorted.data < 0 - OR EXISTS ( - SELECT 1 FROM t2_sorted - WHERE t2_sorted.key = t1_sorted.key - AND t2_sorted.data <> t1_sorted.data - AND t2_sorted.data % 10 <> 0 - ) - "#, ]; impl RunOpt { @@ -555,10 +489,7 @@ impl RunOpt { let sql = SMJ_QUERIES[query_index]; benchmark_run.start_new_case(&format!("Query {query_id}")); - let expect_mark = query_id >= 24; - let query_run = self - .benchmark_query(sql, 
&query_id.to_string(), expect_mark, &ctx) - .await; + let query_run = self.benchmark_query(sql, &query_id.to_string(), &ctx).await; match query_run { Ok(query_results) => { for iter in query_results { @@ -582,7 +513,6 @@ impl RunOpt { &self, sql: &str, query_name: &str, - expect_mark: bool, ctx: &SessionContext, ) -> Result> { let mut query_results = vec![]; @@ -598,12 +528,6 @@ impl RunOpt { )); } - if expect_mark && !plan_string.contains("LeftMark") { - return Err(exec_datafusion_err!( - "Query {query_name} expected LeftMark join. Physical plan: {plan_string}" - )); - } - for i in 0..self.common.iterations { let start = Instant::now();