From 64f66d9110ec6834d468133b193e0a8b6ee02414 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Wed, 17 Sep 2025 19:48:47 -0400 Subject: [PATCH 1/6] feat: Add Hash Join benchmarks --- benchmarks/README.md | 14 ++ benchmarks/bench.sh | 17 +++ benchmarks/src/bin/dfbench.rs | 4 +- benchmarks/src/hj.rs | 258 ++++++++++++++++++++++++++++++++++ benchmarks/src/lib.rs | 1 + 5 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 benchmarks/src/hj.rs diff --git a/benchmarks/README.md b/benchmarks/README.md index 872500ef849fa..99121c6c1b243 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -727,6 +727,20 @@ Different queries are included to test nested loop joins under various workloads ./bench.sh run nlj ``` +## Hash Join + +This benchmark focuses on the performance of queries with nested hash joins, minimizing other overheads such as scanning data sources or evaluating predicates. + +Different queries are included to test nested loop joins under various workloads. + +### Example Run + +```bash +# No need to generate data: this benchmark uses table function `range()` as the data source + +./bench.sh run hj +``` + ## Cancellation Test performance of cancelling queries. 
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index b99ab010058f3..c4988890445b0 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -125,6 +125,7 @@ imdb: Join Order Benchmark (JOB) using the IMDB dataset conver # Micro-Benchmarks (specific operators and features) cancellation: How long cancelling a query takes nlj: Benchmark for simple nested loop joins, testing various join scenarios +hj: Benchmark for simple hash joins, testing various join scenarios ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Supported Configuration (Environment Variables) @@ -304,6 +305,10 @@ main() { # nlj uses range() function, no data generation needed echo "NLJ benchmark does not require data generation" ;; + hj) + # hj uses range() function, no data generation needed + echo "HJ benchmark does not require data generation" + ;; *) echo "Error: unknown benchmark '$BENCHMARK' for data generation" usage @@ -361,6 +366,7 @@ main() { run_imdb run_external_aggr run_nlj + run_hj ;; tpch) run_tpch "1" "parquet" @@ -468,6 +474,9 @@ main() { nlj) run_nlj ;; + hj) + run_hj + ;; *) echo "Error: unknown benchmark '$BENCHMARK' for run" usage @@ -1103,6 +1112,14 @@ run_nlj() { debug_run $CARGO_COMMAND --bin dfbench -- nlj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} } +# Runs the hj benchmark +run_hj() { + RESULTS_FILE="${RESULTS_DIR}/hj.json" + echo "RESULTS_FILE: ${RESULTS_FILE}" + echo "Running hj benchmark..." 
+ debug_run $CARGO_COMMAND --bin dfbench -- hj --iterations 5 -o "${RESULTS_FILE}" ${QUERY_ARG} +} + compare_benchmarks() { BASE_RESULTS_DIR="${SCRIPT_DIR}/results" diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs index 88378492b7267..79e834559861d 100644 --- a/benchmarks/src/bin/dfbench.rs +++ b/benchmarks/src/bin/dfbench.rs @@ -33,7 +33,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; -use datafusion_benchmarks::{cancellation, clickbench, h2o, imdb, nlj, sort_tpch, tpch}; +use datafusion_benchmarks::{cancellation, clickbench, h2o, hj, imdb, nlj, sort_tpch, tpch}; #[derive(Debug, StructOpt)] #[structopt(about = "benchmark command")] @@ -41,6 +41,7 @@ enum Options { Cancellation(cancellation::RunOpt), Clickbench(clickbench::RunOpt), H2o(h2o::RunOpt), + HJ(hj::RunOpt), Imdb(imdb::RunOpt), Nlj(nlj::RunOpt), SortTpch(sort_tpch::RunOpt), @@ -57,6 +58,7 @@ pub async fn main() -> Result<()> { Options::Cancellation(opt) => opt.run().await, Options::Clickbench(opt) => opt.run().await, Options::H2o(opt) => opt.run().await, + Options::HJ(opt) => opt.run().await, Options::Imdb(opt) => Box::pin(opt.run()).await, Options::Nlj(opt) => opt.run().await, Options::SortTpch(opt) => opt.run().await, diff --git a/benchmarks/src/hj.rs b/benchmarks/src/hj.rs new file mode 100644 index 0000000000000..81f1b2bc8831d --- /dev/null +++ b/benchmarks/src/hj.rs @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::util::{BenchmarkRun, CommonOpt, QueryResult}; +use datafusion::physical_plan::execute_stream; +use datafusion::{error::Result, prelude::SessionContext}; +use datafusion_common::instant::Instant; +use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError}; +use structopt::StructOpt; + +use futures::StreamExt; + +// TODO: Add existence joins + +/// Run the Hash Join benchmark +/// +/// This micro-benchmark focuses on the performance characteristics of Hash Joins. +/// It uses simple equality predicates to ensure a hash join is selected. +/// Where we vary selectivity, we do so with additional cheap predicates that +/// do not change the join key (so the physical operator remains HashJoin). +#[derive(Debug, StructOpt, Clone)] +#[structopt(verbatim_doc_comment)] +pub struct RunOpt { + /// Query number (1-based). If not specified, runs all queries + #[structopt(short, long)] + query: Option<usize>, + + /// Common options (iterations, batch size, target_partitions, etc.) + #[structopt(flatten)] + common: CommonOpt, + + /// If present, write results json here + #[structopt(parse(from_os_str), short = "o", long = "output")] + output_path: Option<std::path::PathBuf>, +} + +/// Inline SQL queries for Hash Join benchmarks +/// +/// Each query's comment includes: +/// - Left row count × Right row count +/// - Join predicate selectivity (approximate output fraction). 
+const HASH_QUERIES: &[&str] = &[ + // Q1: INNER 10K x 10K | LOW ~0.1% + // equality on key + cheap filter to downselect + r#" + SELECT t1.value, t2.value + FROM range(10000) AS t1 + JOIN range(10000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 1000 = 0 + "#, + // Q2: INNER 10K x 10K | MEDIUM ~20% + r#" + SELECT t1.value, t2.value + FROM range(10000) AS t1 + JOIN range(10000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 5 = 0 + "#, + // Q3: INNER 10K x 10K | HIGH ~90% + r#" + SELECT t1.value, t2.value + FROM range(10000) AS t1 + JOIN range(10000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 10 <> 0 + "#, + // Q4: INNER 30K x 30K | MEDIUM ~20% + r#" + SELECT t1.value, t2.value + FROM range(30000) AS t1 + JOIN range(30000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 5 = 0 + "#, + // Q5: INNER 10K x 200K | LOW ~0.1% (small to large) + r#" + SELECT t1.value, t2.value + FROM range(10000) AS t1 + JOIN range(200000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 1000 = 0 + "#, + // Q6: INNER 200K x 10K | LOW ~0.1% (large to small) + r#" + SELECT t1.value, t2.value + FROM range(200000) AS t1 + JOIN range(10000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 1000 = 0 + "#, + // Q7: RIGHT OUTER 10K x 200K | LOW ~0.1% + // Outer join still uses HashJoin for equi-keys; the extra filter reduces matches + r#" + SELECT t1.value AS l, t2.value AS r + FROM range(10000) AS t1 + RIGHT JOIN range(200000) AS t2 + ON t1.value = t2.value + WHERE t2.value % 1000 = 0 + "#, + // Q8: LEFT OUTER 200K x 10K | LOW ~0.1% + r#" + SELECT t1.value AS l, t2.value AS r + FROM range(200000) AS t1 + LEFT JOIN range(10000) AS t2 + ON t1.value = t2.value + WHERE t1.value % 1000 = 0 + "#, + // Q9: FULL OUTER 30K x 30K | LOW ~0.1% + r#" + SELECT t1.value AS l, t2.value AS r + FROM range(30000) AS t1 + FULL JOIN range(30000) AS t2 + ON t1.value = t2.value + WHERE COALESCE(t1.value, t2.value) % 1000 = 0 + "#, + // Q10: FULL OUTER 30K x 30K | HIGH ~90% + r#" + SELECT t1.value AS l, 
t2.value AS r + FROM range(30000) AS t1 + FULL JOIN range(30000) AS t2 + ON t1.value = t2.value + WHERE COALESCE(t1.value, t2.value) % 10 <> 0 + "#, + // Q11: INNER 30K x 30K | MEDIUM ~50% | cheap predicate on parity + r#" + SELECT t1.value, t2.value + FROM range(30000) AS t1 + INNER JOIN range(30000) AS t2 + ON (t1.value % 2) = (t2.value % 2) + "#, + // Q12: FULL OUTER 30K x 30K | MEDIUM ~50% | expression key + r#" + SELECT t1.value AS l, t2.value AS r + FROM range(30000) AS t1 + FULL JOIN range(30000) AS t2 + ON (t1.value % 2) = (t2.value % 2) + "#, +]; + +impl RunOpt { + pub async fn run(self) -> Result<()> { + println!("Running Hash Join benchmarks with the following options: {self:#?}\n"); + + let query_range = match self.query { + Some(query_id) => { + if query_id >= 1 && query_id <= HASH_QUERIES.len() { + query_id..=query_id + } else { + return exec_err!( + "Query {query_id} not found. Available queries: 1 to {}", + HASH_QUERIES.len() + ); + } + } + None => 1..=HASH_QUERIES.len(), + }; + + let config = self.common.config()?; + let rt_builder = self.common.runtime_env_builder()?; + let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?); + + let mut benchmark_run = BenchmarkRun::new(); + + for query_id in query_range { + let query_index = query_id - 1; + let sql = HASH_QUERIES[query_index]; + + benchmark_run.start_new_case(&format!("Query {query_id}")); + let query_run = self.benchmark_query(sql, &query_id.to_string(), &ctx).await; + match query_run { + Ok(query_results) => { + for iter in query_results { + benchmark_run.write_iter(iter.elapsed, iter.row_count); + } + } + Err(e) => { + return Err(DataFusionError::Context( + format!("Hash Join benchmark Q{query_id} failed with error:"), + Box::new(e), + )); + } + } + } + + benchmark_run.maybe_write_json(self.output_path.as_ref())?; + Ok(()) + } + + /// Validates that the physical plan uses a HashJoin, then executes. 
+ async fn benchmark_query( + &self, + sql: &str, + query_name: &str, + ctx: &SessionContext, + ) -> Result<Vec<QueryResult>> { + let mut query_results = vec![]; + + // Build/validate plan + let df = ctx.sql(sql).await?; + let physical_plan = df.create_physical_plan().await?; + let plan_string = format!("{physical_plan:#?}"); + + if !plan_string.contains("HashJoinExec") { + return Err(exec_datafusion_err!( + "Query {query_name} does not use Hash Join. Physical plan: {plan_string}" + )); + } + + // Execute without buffering + for i in 0..self.common.iterations { + let start = Instant::now(); + let row_count = Self::execute_sql_without_result_buffering(sql, ctx).await?; + let elapsed = start.elapsed(); + + println!( + "Query {query_name} iteration {i} returned {row_count} rows in {elapsed:?}" + ); + + query_results.push(QueryResult { elapsed, row_count }); + } + + Ok(query_results) + } + + /// Executes the SQL query and drops each batch to avoid result buffering. + async fn execute_sql_without_result_buffering( + sql: &str, + ctx: &SessionContext, + ) -> Result<usize> { + let mut row_count = 0; + + let df = ctx.sql(sql).await?; + let physical_plan = df.create_physical_plan().await?; + let mut stream = execute_stream(physical_plan, ctx.task_ctx())?; + + while let Some(batch) = stream.next().await { + row_count += batch?.num_rows(); + // Drop batches immediately to minimize memory pressure + } + + Ok(row_count) + } +} diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs index 5d982fad6f77f..07cffa5ae468e 100644 --- a/benchmarks/src/lib.rs +++ b/benchmarks/src/lib.rs @@ -19,6 +19,7 @@ pub mod cancellation; pub mod clickbench; pub mod h2o; +pub mod hj; pub mod imdb; pub mod nlj; pub mod sort_tpch; From a8f3f72b6ae603d5ff54a97003dbc7c4c9553d8d Mon Sep 17 00:00:00 2001 From: Jonathan Date: Wed, 17 Sep 2025 19:53:12 -0400 Subject: [PATCH 2/6] fmt --- benchmarks/src/bin/dfbench.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/src/bin/dfbench.rs 
b/benchmarks/src/bin/dfbench.rs index 79e834559861d..816cae0e38555 100644 --- a/benchmarks/src/bin/dfbench.rs +++ b/benchmarks/src/bin/dfbench.rs @@ -33,7 +33,9 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; -use datafusion_benchmarks::{cancellation, clickbench, h2o, hj, imdb, nlj, sort_tpch, tpch}; +use datafusion_benchmarks::{ + cancellation, clickbench, h2o, hj, imdb, nlj, sort_tpch, tpch, +}; #[derive(Debug, StructOpt)] #[structopt(about = "benchmark command")] From 06e4c63b56bb24fb0051e287e45f7b9510c2a8b4 Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Thu, 18 Sep 2025 22:21:20 -0400 Subject: [PATCH 3/6] Update benchmarks/README.md Co-authored-by: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> --- benchmarks/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 99121c6c1b243..a1c5c1b620d5f 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -731,7 +731,7 @@ Different queries are included to test nested loop joins under various workloads This benchmark focuses on the performance of queries with nested hash joins, minimizing other overheads such as scanning data sources or evaluating predicates. -Different queries are included to test nested loop joins under various workloads. +Several queries are included to test hash joins under various workloads. 
### Example Run From 1ff500c8ba33481a3c0e10f5aeab26d00a9d3df7 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Sun, 21 Sep 2025 00:35:55 -0400 Subject: [PATCH 4/6] add benchmarks --- benchmarks/src/hj.rs | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/benchmarks/src/hj.rs b/benchmarks/src/hj.rs index 81f1b2bc8831d..23f4e1220be53 100644 --- a/benchmarks/src/hj.rs +++ b/benchmarks/src/hj.rs @@ -58,10 +58,9 @@ const HASH_QUERIES: &[&str] = &[ // equality on key + cheap filter to downselect r#" SELECT t1.value, t2.value - FROM range(10000) AS t1 + FROM generate_series(0,10000, 1000) AS t1(value) JOIN range(10000) AS t2 - ON t1.value = t2.value - WHERE t1.value % 1000 = 0 + ON t1.value = t2.value; "#, // Q2: INNER 10K x 10K | MEDIUM ~20% r#" @@ -150,6 +149,20 @@ const HASH_QUERIES: &[&str] = &[ FULL JOIN range(30000) AS t2 ON (t1.value % 2) = (t2.value % 2) "#, + // Q13: INNER 30K x 30K | MEDIUM ~33% | double predicate + r#" + SELECT t1.value, t2.value + FROM range(30000) AS t1 + INNER JOIN range(30000) AS t2 + ON (t1.value = t2.value) AND (t1.value > 10000 and t2.value < 20000) + "#, + // Q14: FULL OUTER 30K x 30K | MEDIUM ~50% | modulo + r#" + SELECT t1.value AS l, t2.value AS r + FROM range(30000) AS t1 + FULL JOIN range(30000) AS t2 + ON (t1.value = t2.value) AND ((t1.value+t2.value)%10 = 0) + "#, ]; impl RunOpt { From eb7173d0e9b2ddb4a98450ea651508608c43b823 Mon Sep 17 00:00:00 2001 From: Jonathan Date: Sun, 28 Sep 2025 19:09:28 -0400 Subject: [PATCH 5/6] update selectivities --- benchmarks/src/hj.rs | 58 +++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/benchmarks/src/hj.rs b/benchmarks/src/hj.rs index 23f4e1220be53..505b322745485 100644 --- a/benchmarks/src/hj.rs +++ b/benchmarks/src/hj.rs @@ -53,19 +53,21 @@ pub struct RunOpt { /// Each query's comment includes: /// - Left row count × Right row count /// - Join predicate selectivity (approximate output fraction). 
+/// - Q11 and Q12 selectivity is relative to cartesian product while the others are +/// relative to probe side. const HASH_QUERIES: &[&str] = &[ - // Q1: INNER 10K x 10K | LOW ~0.1% + // Q1: INNER 10 x 10K | LOW ~0.1% // equality on key + cheap filter to downselect r#" SELECT t1.value, t2.value - FROM generate_series(0,10000, 1000) AS t1(value) + FROM generate_series(0, 9000, 1000) AS t1(value) JOIN range(10000) AS t2 ON t1.value = t2.value; "#, - // Q2: INNER 10K x 10K | MEDIUM ~20% + // Q2: INNER 10 x 10K | LOW ~0.1% r#" SELECT t1.value, t2.value - FROM range(10000) AS t1 + FROM generate_series(0, 9000, 1000) AS t1 JOIN range(10000) AS t2 ON t1.value = t2.value WHERE t1.value % 5 = 0 @@ -78,90 +80,90 @@ const HASH_QUERIES: &[&str] = &[ ON t1.value = t2.value WHERE t1.value % 10 <> 0 "#, - // Q4: INNER 30K x 30K | MEDIUM ~20% + // Q4: INNER 30 x 30K | LOW ~0.1% r#" SELECT t1.value, t2.value - FROM range(30000) AS t1 + FROM generate_series(0, 29000, 1000) AS t1 JOIN range(30000) AS t2 ON t1.value = t2.value WHERE t1.value % 5 = 0 "#, - // Q5: INNER 10K x 200K | LOW ~0.1% (small to large) + // Q5: INNER 10 x 200K | VERY LOW ~0.005% (small to large) r#" SELECT t1.value, t2.value - FROM range(10000) AS t1 + FROM generate_series(0, 9000, 1000) AS t1 JOIN range(200000) AS t2 ON t1.value = t2.value WHERE t1.value % 1000 = 0 "#, - // Q6: INNER 200K x 10K | LOW ~0.1% (large to small) + // Q6: INNER 200K x 10 | VERY LOW ~0.005% (large to small) r#" SELECT t1.value, t2.value FROM range(200000) AS t1 - JOIN range(10000) AS t2 + JOIN generate_series(0, 9000, 1000) AS t2 ON t1.value = t2.value WHERE t1.value % 1000 = 0 "#, - // Q7: RIGHT OUTER 10K x 200K | LOW ~0.1% + // Q7: RIGHT OUTER 10 x 200K | LOW ~0.1% // Outer join still uses HashJoin for equi-keys; the extra filter reduces matches r#" SELECT t1.value AS l, t2.value AS r - FROM range(10000) AS t1 + FROM generate_series(0, 9000, 1000) AS t1 RIGHT JOIN range(200000) AS t2 ON t1.value = t2.value WHERE t2.value % 1000 = 0 
"#, - // Q8: LEFT OUTER 200K x 10K | LOW ~0.1% + // Q8: LEFT OUTER 200K x 10 | LOW ~0.1% r#" SELECT t1.value AS l, t2.value AS r FROM range(200000) AS t1 - LEFT JOIN range(10000) AS t2 + LEFT JOIN generate_series(0, 9000, 1000) AS t2 ON t1.value = t2.value WHERE t1.value % 1000 = 0 "#, - // Q9: FULL OUTER 30K x 30K | LOW ~0.1% + // Q9: FULL OUTER 30 x 30K | LOW ~0.1% r#" SELECT t1.value AS l, t2.value AS r - FROM range(30000) AS t1 + FROM generate_series(0, 29000, 1000) AS t1 FULL JOIN range(30000) AS t2 ON t1.value = t2.value WHERE COALESCE(t1.value, t2.value) % 1000 = 0 "#, - // Q10: FULL OUTER 30K x 30K | HIGH ~90% + // Q10: FULL OUTER 30 x 30K | HIGH ~90% r#" SELECT t1.value AS l, t2.value AS r - FROM range(30000) AS t1 + FROM generate_series(0, 29000, 1000) AS t1 FULL JOIN range(30000) AS t2 ON t1.value = t2.value WHERE COALESCE(t1.value, t2.value) % 10 <> 0 "#, - // Q11: INNER 30K x 30K | MEDIUM ~50% | cheap predicate on parity + // Q11: INNER 30 x 30K | MEDIUM ~50% | cheap predicate on parity r#" SELECT t1.value, t2.value - FROM range(30000) AS t1 + FROM generate_series(0, 29000, 1000) AS t1 INNER JOIN range(30000) AS t2 ON (t1.value % 2) = (t2.value % 2) "#, - // Q12: FULL OUTER 30K x 30K | MEDIUM ~50% | expression key + // Q12: FULL OUTER 30 x 30K | MEDIUM ~50% | expression key r#" SELECT t1.value AS l, t2.value AS r - FROM range(30000) AS t1 + FROM generate_series(0, 29000, 1000) AS t1 FULL JOIN range(30000) AS t2 ON (t1.value % 2) = (t2.value % 2) "#, - // Q13: INNER 30K x 30K | MEDIUM ~33% | double predicate + // Q13: INNER 30 x 30K | LOW 0.1% | modulo with adding values r#" SELECT t1.value, t2.value - FROM range(30000) AS t1 + FROM generate_series(0, 29000, 1000) AS t1 INNER JOIN range(30000) AS t2 - ON (t1.value = t2.value) AND (t1.value > 10000 and t2.value < 20000) + ON (t1.value = t2.value) AND ((t1.value + t2.value) % 10 < 1) "#, - // Q14: FULL OUTER 30K x 30K | MEDIUM ~50% | modulo + // Q14: FULL OUTER 30 x 30K | ALL ~100% | modulo r#" SELECT 
t1.value AS l, t2.value AS r - FROM range(30000) AS t1 + FROM generate_series(0, 29000, 1000) AS t1 FULL JOIN range(30000) AS t2 - ON (t1.value = t2.value) AND ((t1.value+t2.value)%10 = 0) + ON (t1.value = t2.value) AND ((t1.value + t2.value) % 10 = 0) "#, ]; From 41c3cdb7262c3f8ca2360be9780864932ddb024a Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Mon, 29 Sep 2025 17:03:11 +0800 Subject: [PATCH 6/6] fix the error introduced when merging main --- benchmarks/bench.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index 9b65fa458bb08..dbfd319dd9ad4 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -310,6 +310,7 @@ main() { hj) # hj uses range() function, no data generation needed echo "HJ benchmark does not require data generation" + ;; compile_profile) data_tpch "1" ;; @@ -492,6 +493,7 @@ main() { ;; hj) run_hj + ;; compile_profile) run_compile_profile "${PROFILE_ARGS[@]}" ;;