diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 9aa776fe054cb..dc0e81a6f36e2 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Defines the join plan for executing partitions in parallel and then joining the results -//! into a set of partitions. +//! [`HashJoinExec`] Partitioned Hash Join Operator use std::fmt; use std::mem::size_of; @@ -78,29 +77,140 @@ use futures::{ready, Stream, StreamExt, TryStreamExt}; type JoinLeftData = (JoinHashMap, RecordBatch, MemoryReservation); -/// Join execution plan executes partitions in parallel and combines them into a set of -/// partitions. +/// Join execution plan: Evaluates eqijoin predicates in parallel on multiple +/// partitions using a hash table and an optional filter list to apply post +/// join. /// -/// Filter expression expected to contain non-equality predicates that can not be pushed -/// down to any of join inputs. -/// In case of outer join, filter applied to only matched rows. +/// # Join Expressions +/// +/// This implementation is optimized for evaluating eqijoin predicates ( +/// ` = `) expressions, which are represented as a list of `Columns` +/// in [`Self::on`]. +/// +/// Non-equality predicates, which can not pushed down to a join inputs (e.g. +/// ` != `) are known as "filter expressions" and are evaluated +/// after the equijoin predicates. +/// +/// # "Build Side" vs "Probe Side" +/// +/// HashJoin takes two inputs, which are referred to as the "build" and the +/// "probe". The build side is the first child, and the probe side is the second +/// child. +/// +/// The two inputs are treated differently and it is VERY important that the +/// *smaller* input is placed on the build side to minimize the work of creating +/// the hash table. +/// +/// ```text +/// ┌───────────┐ +/// │ HashJoin │ +/// │ │ +/// └───────────┘ +/// │ │ +/// ┌─────┘ └─────┐ +/// ▼ ▼ +/// ┌────────────┐ ┌─────────────┐ +/// │ Input │ │ Input │ +/// │ [0] │ │ [1] │ +/// └────────────┘ └─────────────┘ +/// +/// "build side" "probe side" +/// ``` +/// +/// Execution proceeds in 2 stages: +/// +/// 1. the **build phase** where a hash table is created from the tuples of the +/// build side. +/// +/// 2. the **probe phase** where the tuples of the probe side are streamed +/// through, checking for matches of the join keys in the hash table. +/// +/// ```text +/// ┌────────────────┐ ┌────────────────┐ +/// │ ┌─────────┐ │ │ ┌─────────┐ │ +/// │ │ Hash │ │ │ │ Hash │ │ +/// │ │ Table │ │ │ │ Table │ │ +/// │ │(keys are│ │ │ │(keys are│ │ +/// │ │equi join│ │ │ │equi join│ │ Stage 2: batches from +/// Stage 1: the │ │columns) │ │ │ │columns) │ │ the probe side are +/// *entire* build │ │ │ │ │ │ │ │ streamed through, and +/// side is read │ └─────────┘ │ │ └─────────┘ │ checked against the +/// into the hash │ ▲ │ │ ▲ │ contents of the hash +/// table │ HashJoin │ │ HashJoin │ table +/// └──────┼─────────┘ └──────────┼─────┘ +/// ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +/// │ │ +/// +/// │ │ +/// ┌────────────┐ ┌────────────┐ +/// │RecordBatch │ │RecordBatch │ +/// └────────────┘ └────────────┘ +/// ┌────────────┐ ┌────────────┐ +/// │RecordBatch │ │RecordBatch │ +/// └────────────┘ └────────────┘ +/// ... ... +/// ┌────────────┐ ┌────────────┐ +/// │RecordBatch │ │RecordBatch │ +/// └────────────┘ └────────────┘ +/// +/// build side probe side +/// +/// ``` +/// +/// # Example "Optimal" Plans +/// +/// The differences in the inputs means that for classic "Star Schema Query", +/// the optimal plan will be a **"Right Deep Tree"** . A Star Schema Query is +/// one where there is one large table and several smaller "dimension" tables, +/// joined on `Foreign Key = Primary Key` predicates. +/// +/// A "Right Deep Tree" looks like this large table as the probe side on the +/// lowest join: +/// +/// ```text +/// ┌───────────┐ +/// │ HashJoin │ +/// │ │ +/// └───────────┘ +/// │ │ +/// ┌───────┘ └──────────┐ +/// ▼ ▼ +/// ┌───────────────┐ ┌───────────┐ +/// │ small table 1 │ │ HashJoin │ +/// │ "dimension" │ │ │ +/// └───────────────┘ └───┬───┬───┘ +/// ┌──────────┘ └───────┐ +/// │ │ +/// ▼ ▼ +/// ┌───────────────┐ ┌───────────┐ +/// │ small table 2 │ │ HashJoin │ +/// │ "dimension" │ │ │ +/// └───────────────┘ └───┬───┬───┘ +/// ┌────────┘ └────────┐ +/// │ │ +/// ▼ ▼ +/// ┌───────────────┐ ┌───────────────┐ +/// │ small table 3 │ │ large table │ +/// │ "dimension" │ │ "fact" │ +/// └───────────────┘ └───────────────┘ +/// ``` #[derive(Debug)] pub struct HashJoinExec { /// left (build) side which gets hashed pub left: Arc, /// right (probe) side which are filtered by the hash table pub right: Arc, - /// Set of common columns used to join on + /// Set of equijoin columns from the relations: `(left_col, right_col)` pub on: Vec<(Column, Column)>, /// Filters which are applied while finding matching rows pub filter: Option, - /// How the join is performed + /// How the join is performed (`OUTER`, `INNER`, etc) pub join_type: JoinType, - /// The schema once the join is applied + /// The output schema for the join schema: SchemaRef, /// Build-side data left_fut: OnceAsync, - /// Shares the `RandomState` for the hashing algorithm + /// Shared the `RandomState` for the hashing algorithm random_state: RandomState, /// Output order output_order: Option>, @@ -110,12 +220,16 @@ pub struct HashJoinExec { metrics: ExecutionPlanMetricsSet, /// Information of index and left / right placement of columns column_indices: Vec, - /// If null_equals_null is true, null == null else null != null + /// Null matching behavior: If `null_equals_null` is true, rows that have + /// `null`s in both left and right equijoin columns will be matched. + /// Otherwise, rows that have `null`s in the join columns will not be + /// matched and thus will not appear in the output. pub null_equals_null: bool, } impl HashJoinExec { /// Tries to create a new [HashJoinExec]. + /// /// # Error /// This function errors when it is not possible to join the left and right sides on keys `on`. pub fn try_new(