Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 192 additions & 0 deletions benchmarks/src/hj.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,198 @@ const HASH_QUERIES: &[HashJoinQuery] = &[
build_size: "100K_(20%_dups)",
probe_size: "60M",
},
// RightSemi Join benchmarks with Int32 keys
// Q16: RightSemi, 100% Density, 100% Hit rate
HashJoinQuery {
sql: r###"SELECT l.k
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be clearer to express these directly using RIGHT SEMI JOIN, for example:

DataFusion CLI v53.1.0
> select count(*)
from generate_series(100) as t1(v1)
right semi join generate_series(100000) as t2(v1)
on t1.v1 > t2.v1;
+----------+
| count(*) |
+----------+
| 100      |
+----------+
1 row(s) fetched.
Elapsed 0.077 seconds.

> select count(*)
from generate_series(100) as t1(v1)
right anti join generate_series(100000) as t2(v1)
on t1.v1 > t2.v1;
+----------+
| count(*) |
+----------+
| 99901    |
+----------+
1 row(s) fetched.
Elapsed 0.007 seconds.

I'm not sure whether this is standard SQL 🤔, but DataFusion supports both `RIGHT SEMI JOIN` and `RIGHT ANTI JOIN`, and they are easier to read.

FROM (
SELECT CAST(l_suppkey AS INT) as k FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 1.0,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// Q17: RightSemi, 100% Density, 10% Hit rate
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END AS INT) as k
FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 1.0,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// Q18: RightSemi, 50% Density, 100% Hit rate
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey * 2 AS INT) as k FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 2 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.5,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// Q19: RightSemi, 50% Density, 10% Hit rate
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE
WHEN l_suppkey % 10 = 0 THEN l_suppkey * 2
WHEN l_suppkey % 10 < 9 THEN l_suppkey * 2 + 1
ELSE l_suppkey * 2 + 1000000
END AS INT) as k
FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 2 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.5,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// Q20: RightSemi, 10% Density, 100% Hit rate
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey * 10 AS INT) as k FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 10 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.1,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// Q21: RightSemi, 10% Density, 10% Hit rate
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE
WHEN l_suppkey % 10 = 0 THEN l_suppkey * 10
WHEN l_suppkey % 10 < 9 THEN l_suppkey * 10 + 1
ELSE l_suppkey * 10 + 1000000
END AS INT) as k
FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 10 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.1,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// RightAnti Join benchmarks with Int32 keys
// Q22: RightAnti, 100% Density, 100% Hit rate (no output)
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey AS INT) as k FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 1.0,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightAnti",
},
// Q23: RightAnti, 100% Density, 10% Hit rate (90% output)
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END AS INT) as k
FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 1.0,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightAnti",
},
// Q24: RightAnti, 50% Density, 100% Hit rate (no output)
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey * 2 AS INT) as k FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 2 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.5,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightAnti",
},
// Q25: RightAnti, 50% Density, 10% Hit rate (90% output)
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE
WHEN l_suppkey % 10 = 0 THEN l_suppkey * 2
WHEN l_suppkey % 10 < 9 THEN l_suppkey * 2 + 1
ELSE l_suppkey * 2 + 1000000
END AS INT) as k
FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 2 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.5,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightAnti",
},
// Q26: RightAnti, 10% Density, 100% Hit rate (no output)
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey * 10 AS INT) as k FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 10 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.1,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightAnti",
},
// Q27: RightAnti, 10% Density, 10% Hit rate (90% output)
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE
WHEN l_suppkey % 10 = 0 THEN l_suppkey * 10
WHEN l_suppkey % 10 < 9 THEN l_suppkey * 10 + 1
ELSE l_suppkey * 10 + 1000000
END AS INT) as k
FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 10 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.1,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightAnti",
},
];

impl RunOpt {
Expand Down
5 changes: 5 additions & 0 deletions datafusion/physical-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,8 @@ required-features = ["test_utils"]
harness = false
name = "aggregate_vectorized"
required-features = ["test_utils"]

[[bench]]
harness = false
name = "hash_join_semi_anti"
required-features = ["test_utils"]
Loading
Loading