From 475f166340ee5e67b2f133307aac98d339180841 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Tue, 26 Apr 2022 12:07:17 +0800 Subject: [PATCH 1/8] first move: re-group aggregates functionalities in core/physical_p/aggregates --- .../rust/core/src/serde/physical_plan/mod.rs | 12 +- ballista/rust/core/src/utils.rs | 4 +- ballista/rust/scheduler/src/planner.rs | 12 +- .../aggregate_statistics.rs | 43 +- .../src/physical_optimizer/repartition.rs | 6 +- .../core/src/physical_plan/aggregates/hash.rs | 477 ++++++ .../core/src/physical_plan/aggregates/mod.rs | 719 +++++++++ .../physical_plan/aggregates/no_grouping.rs | 165 +++ .../core/src/physical_plan/hash_aggregate.rs | 1299 ----------------- datafusion/core/src/physical_plan/mod.rs | 1 - datafusion/core/src/physical_plan/planner.rs | 8 +- datafusion/core/tests/sql/explain_analyze.rs | 2 +- 12 files changed, 1403 insertions(+), 1345 deletions(-) create mode 100644 datafusion/core/src/physical_plan/aggregates/hash.rs create mode 100644 datafusion/core/src/physical_plan/aggregates/no_grouping.rs delete mode 100644 datafusion/core/src/physical_plan/hash_aggregate.rs diff --git a/ballista/rust/core/src/serde/physical_plan/mod.rs b/ballista/rust/core/src/serde/physical_plan/mod.rs index ed268820f3947..81e305e2ade99 100644 --- a/ballista/rust/core/src/serde/physical_plan/mod.rs +++ b/ballista/rust/core/src/serde/physical_plan/mod.rs @@ -28,7 +28,8 @@ use datafusion::datasource::listing::PartitionedFile; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::logical_plan::window_frames::WindowFrame; use datafusion::logical_plan::FunctionRegistry; -use datafusion::physical_plan::aggregates::create_aggregate_expr; +use datafusion::physical_plan::aggregates::AggregateExec; +use datafusion::physical_plan::aggregates::{create_aggregate_expr, AggregateMode}; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use 
datafusion::physical_plan::cross_join::CrossJoinExec; @@ -39,7 +40,6 @@ use datafusion::physical_plan::file_format::{ AvroExec, CsvExec, FileScanConfig, ParquetExec, }; use datafusion::physical_plan::filter::FilterExec; -use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::hash_join::{HashJoinExec, PartitionMode}; use datafusion::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion::physical_plan::projection::ProjectionExec; @@ -391,7 +391,7 @@ impl AsExecutionPlan for PhysicalPlanNode { }) .collect::, _>>()?; - Ok(Arc::new(HashAggregateExec::try_new( + Ok(Arc::new(AggregateExec::try_new( agg_mode, group, physical_aggr_expr, @@ -730,7 +730,7 @@ impl AsExecutionPlan for PhysicalPlanNode { }, ))), }) - } else if let Some(exec) = plan.downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { let groups = exec .group_expr() .iter() @@ -1080,12 +1080,12 @@ mod roundtrip_tests { datasource::listing::PartitionedFile, logical_plan::{JoinType, Operator}, physical_plan::{ + aggregates::{AggregateExec, AggregateMode}, empty::EmptyExec, expressions::{binary, col, lit, InListExpr, NotExpr}, expressions::{Avg, Column, PhysicalSortExpr}, file_format::{FileScanConfig, ParquetExec}, filter::FilterExec, - hash_aggregate::{AggregateMode, HashAggregateExec}, hash_join::{HashJoinExec, PartitionMode}, limit::{GlobalLimitExec, LocalLimitExec}, sorts::sort::SortExec, @@ -1226,7 +1226,7 @@ mod roundtrip_tests { DataType::Float64, ))]; - roundtrip_test(Arc::new(HashAggregateExec::try_new( + roundtrip_test(Arc::new(AggregateExec::try_new( AggregateMode::Final, groups.clone(), aggregates.clone(), diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 6670ab5cedd83..310f925936b3a 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -48,9 +48,9 @@ use datafusion::physical_plan::common::batch_byte_size; use 
datafusion::physical_plan::empty::EmptyExec; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::physical_plan::aggregates::AggregateExec; use datafusion::physical_plan::file_format::{CsvExec, ParquetExec}; use datafusion::physical_plan::filter::FilterExec; -use datafusion::physical_plan::hash_aggregate::HashAggregateExec; use datafusion::physical_plan::hash_join::HashJoinExec; use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::sorts::sort::SortExec; @@ -151,7 +151,7 @@ fn build_exec_plan_diagram( id: &mut AtomicUsize, draw_entity: bool, ) -> Result { - let operator_str = if plan.as_any().downcast_ref::().is_some() { + let operator_str = if plan.as_any().downcast_ref::().is_some() { "HashAggregateExec" } else if plan.as_any().downcast_ref::().is_some() { "SortExec" diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 8198c4ed27c0d..0d2de5089ae30 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -276,8 +276,8 @@ mod test { use ballista_core::error::BallistaError; use ballista_core::execution_plans::UnresolvedShuffleExec; use ballista_core::serde::{protobuf, AsExecutionPlan, BallistaCodec}; + use datafusion::physical_plan::aggregates::{AggregateExec, AggregateMode}; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; - use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::hash_join::HashJoinExec; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::{ @@ -346,14 +346,14 @@ mod test { // verify stage 0 let stage0 = stages[0].children()[0].clone(); - let partial_hash = downcast_exec!(stage0, HashAggregateExec); + let partial_hash = downcast_exec!(stage0, AggregateExec); assert!(*partial_hash.mode() == AggregateMode::Partial); // verify stage 1 let stage1 = stages[1].children()[0].clone(); let 
projection = downcast_exec!(stage1, ProjectionExec); let final_hash = projection.children()[0].clone(); - let final_hash = downcast_exec!(final_hash, HashAggregateExec); + let final_hash = downcast_exec!(final_hash, AggregateExec); assert!(*final_hash.mode() == AggregateMode::FinalPartitioned); let coalesce = final_hash.children()[0].clone(); let coalesce = downcast_exec!(coalesce, CoalesceBatchesExec); @@ -514,7 +514,7 @@ order by .partition_count() ); - let hash_agg = downcast_exec!(input, HashAggregateExec); + let hash_agg = downcast_exec!(input, AggregateExec); let coalesce_batches = hash_agg.children()[0].clone(); let coalesce_batches = downcast_exec!(coalesce_batches, CoalesceBatchesExec); @@ -586,8 +586,8 @@ order by let partial_hash = stages[0].children()[0].clone(); let partial_hash_serde = roundtrip_operator(partial_hash.clone())?; - let partial_hash = downcast_exec!(partial_hash, HashAggregateExec); - let partial_hash_serde = downcast_exec!(partial_hash_serde, HashAggregateExec); + let partial_hash = downcast_exec!(partial_hash, AggregateExec); + let partial_hash_serde = downcast_exec!(partial_hash_serde, AggregateExec); assert_eq!( format!("{:?}", partial_hash), diff --git a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs index 9af053f934fb9..f8004516738d5 100644 --- a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs +++ b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs @@ -21,8 +21,8 @@ use std::sync::Arc; use arrow::datatypes::Schema; use crate::execution::context::SessionConfig; +use crate::physical_plan::aggregates::{AggregateExec, AggregateMode}; use crate::physical_plan::empty::EmptyExec; -use crate::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::{ expressions, AggregateExpr, ColumnStatistics, ExecutionPlan, Statistics, @@ -53,7 +53,7 @@ impl 
PhysicalOptimizerRule for AggregateStatistics { if let Some(partial_agg_exec) = take_optimizable(&*plan) { let partial_agg_exec = partial_agg_exec .as_any() - .downcast_ref::() + .downcast_ref::() .expect("take_optimizable() ensures that this is a HashAggregateExec"); let stats = partial_agg_exec.input().statistics(); let mut projections = vec![]; @@ -104,14 +104,14 @@ impl PhysicalOptimizerRule for AggregateStatistics { /// We would have prefered to return a casted ref to HashAggregateExec but the recursion requires /// the `ExecutionPlan.children()` method that returns an owned reference. fn take_optimizable(node: &dyn ExecutionPlan) -> Option> { - if let Some(final_agg_exec) = node.as_any().downcast_ref::() { + if let Some(final_agg_exec) = node.as_any().downcast_ref::() { if final_agg_exec.mode() == &AggregateMode::Final && final_agg_exec.group_expr().is_empty() { let mut child = Arc::clone(final_agg_exec.input()); loop { if let Some(partial_agg_exec) = - child.as_any().downcast_ref::() + child.as_any().downcast_ref::() { if partial_agg_exec.mode() == &AggregateMode::Partial && partial_agg_exec.group_expr().is_empty() @@ -260,11 +260,11 @@ mod tests { use crate::error::Result; use crate::logical_plan::Operator; + use crate::physical_plan::aggregates::AggregateExec; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::common; use crate::physical_plan::expressions::Count; use crate::physical_plan::filter::FilterExec; - use crate::physical_plan::hash_aggregate::HashAggregateExec; use crate::physical_plan::memory::MemoryExec; use crate::prelude::SessionContext; @@ -291,10 +291,7 @@ mod tests { } /// Checks that the count optimization was applied and we still get the right result - async fn assert_count_optim_success( - plan: HashAggregateExec, - nulls: bool, - ) -> Result<()> { + async fn assert_count_optim_success(plan: AggregateExec, nulls: bool) -> Result<()> { let session_ctx = SessionContext::new(); let task_ctx = 
session_ctx.task_ctx(); let conf = session_ctx.copied_config(); @@ -336,7 +333,7 @@ mod tests { let source = mock_data()?; let schema = source.schema(); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(None, None)], @@ -344,7 +341,7 @@ mod tests { Arc::clone(&schema), )?; - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(None, None)], @@ -363,7 +360,7 @@ mod tests { let source = mock_data()?; let schema = source.schema(); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -371,7 +368,7 @@ mod tests { Arc::clone(&schema), )?; - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -389,7 +386,7 @@ mod tests { let source = mock_data()?; let schema = source.schema(); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(None, None)], @@ -400,7 +397,7 @@ mod tests { // We introduce an intermediate optimization step between the partial and final aggregtator let coalesce = CoalescePartitionsExec::new(Arc::new(partial_agg)); - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(None, None)], @@ -418,7 +415,7 @@ mod tests { let source = mock_data()?; let schema = source.schema(); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -429,7 +426,7 @@ mod tests { // We introduce an intermediate optimization step between the partial and final aggregtator let coalesce = 
CoalescePartitionsExec::new(Arc::new(partial_agg)); - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -458,7 +455,7 @@ mod tests { source, )?); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(None, None)], @@ -466,7 +463,7 @@ mod tests { Arc::clone(&schema), )?; - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(None, None)], @@ -479,7 +476,7 @@ mod tests { AggregateStatistics::new().optimize(Arc::new(final_agg), &conf)?; // check that the original ExecutionPlan was not replaced - assert!(optimized.as_any().is::()); + assert!(optimized.as_any().is::()); Ok(()) } @@ -500,7 +497,7 @@ mod tests { source, )?); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -508,7 +505,7 @@ mod tests { Arc::clone(&schema), )?; - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -521,7 +518,7 @@ mod tests { AggregateStatistics::new().optimize(Arc::new(final_agg), &conf)?; // check that the original ExecutionPlan was not replaced - assert!(optimized.as_any().is::()); + assert!(optimized.as_any().is::()); Ok(()) } diff --git a/datafusion/core/src/physical_optimizer/repartition.rs b/datafusion/core/src/physical_optimizer/repartition.rs index 2506348fe7a05..cab7ec5d4a43c 100644 --- a/datafusion/core/src/physical_optimizer/repartition.rs +++ b/datafusion/core/src/physical_optimizer/repartition.rs @@ -241,10 +241,10 @@ mod tests { use super::*; use crate::datasource::listing::PartitionedFile; + use crate::physical_plan::aggregates::{AggregateExec, AggregateMode}; use 
crate::physical_plan::expressions::{col, PhysicalSortExpr}; use crate::physical_plan::file_format::{FileScanConfig, ParquetExec}; use crate::physical_plan::filter::FilterExec; - use crate::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::sorts::sort::SortExec; @@ -303,12 +303,12 @@ mod tests { fn hash_aggregate(input: Arc) -> Arc { let schema = schema(); Arc::new( - HashAggregateExec::try_new( + AggregateExec::try_new( AggregateMode::Final, vec![], vec![], Arc::new( - HashAggregateExec::try_new( + AggregateExec::try_new( AggregateMode::Partial, vec![], vec![], diff --git a/datafusion/core/src/physical_plan/aggregates/hash.rs b/datafusion/core/src/physical_plan/aggregates/hash.rs new file mode 100644 index 0000000000000..85e82f14c55d5 --- /dev/null +++ b/datafusion/core/src/physical_plan/aggregates/hash.rs @@ -0,0 +1,477 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Defines the execution plan for the hash aggregate operation + +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::vec; + +use ahash::RandomState; +use futures::{ + ready, + stream::{Stream, StreamExt}, +}; + +use crate::error::Result; +use crate::physical_plan::aggregates::{AccumulatorItem, AggregateMode}; +use crate::physical_plan::hash_utils::create_hashes; +use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; +use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; +use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; +use crate::scalar::ScalarValue; + +use arrow::{array::ArrayRef, compute, compute::cast}; +use arrow::{ + array::{Array, UInt32Builder}, + error::{ArrowError, Result as ArrowResult}, +}; +use arrow::{ + datatypes::{Schema, SchemaRef}, + record_batch::RecordBatch, +}; +use hashbrown::raw::RawTable; + +/* +The architecture is the following: + +1. An accumulator has state that is updated on each batch. +2. At the end of the aggregation (e.g. end of batches in a partition), the accumulator converts its state to a RecordBatch of a single row +3. The RecordBatches of all accumulators are merged (`concatenate` in `rust/arrow`) together to a single RecordBatch. +4. The state's RecordBatch is `merge`d to a new state +5. The state is mapped to the final value + +Why: + +* Accumulators' state can be statically typed, but it is more efficient to transmit data from the accumulators via `Array` +* The `merge` operation must have access to the state of the aggregators because it uses it to correctly merge +* It uses Arrow's native dynamically typed object, `Array`. +* Arrow shines in batch operations and both `merge` and `concatenate` of uniform types are very performant. + +Example: average + +* the state is `n: u32` and `sum: f64` +* For every batch, we update them accordingly. 
+* At the end of the accumulation (of a partition), we convert `n` and `sum` to a RecordBatch of 1 row and two columns: `[n, sum]` +* The RecordBatch is (sent back / transmitted over network) +* Once all N record batches arrive, `merge` is performed, which builds a RecordBatch with N rows and 2 columns. +* Finally, `get_value` returns an array with one entry computed from the state +*/ +pub(crate) struct GroupedHashAggregateStream { + schema: SchemaRef, + input: SendableRecordBatchStream, + mode: AggregateMode, + accumulators: Accumulators, + aggregate_expressions: Vec>>, + + aggr_expr: Vec>, + group_expr: Vec>, + + baseline_metrics: BaselineMetrics, + random_state: RandomState, + finished: bool, +} + +impl GroupedHashAggregateStream { + /// Create a new HashAggregateStream + pub fn new( + mode: AggregateMode, + schema: SchemaRef, + group_expr: Vec>, + aggr_expr: Vec>, + input: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + ) -> Result { + let timer = baseline_metrics.elapsed_compute().timer(); + + // The expressions to evaluate the batch, one vec of expressions per aggregation. + // Assume create_schema() always put group columns in front of aggr columns, we set + // col_idx_base to group expression count. 
+ let aggregate_expressions = + aggregates::aggregate_expressions(&aggr_expr, &mode, group_expr.len())?; + + timer.done(); + + Ok(Self { + schema, + mode, + input, + aggr_expr, + group_expr, + baseline_metrics, + aggregate_expressions, + accumulators: Default::default(), + random_state: Default::default(), + finished: false, + }) + } +} + +impl Stream for GroupedHashAggregateStream { + type Item = ArrowResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let this = &mut *self; + if this.finished { + return Poll::Ready(None); + } + + let elapsed_compute = this.baseline_metrics.elapsed_compute(); + + loop { + let result = match ready!(this.input.poll_next_unpin(cx)) { + Some(Ok(batch)) => { + let timer = elapsed_compute.timer(); + let result = group_aggregate_batch( + &this.mode, + &this.random_state, + &this.group_expr, + &this.aggr_expr, + batch, + &mut this.accumulators, + &this.aggregate_expressions, + ); + + timer.done(); + + match result { + Ok(_) => continue, + Err(e) => Err(ArrowError::ExternalError(Box::new(e))), + } + } + Some(Err(e)) => Err(e), + None => { + this.finished = true; + let timer = this.baseline_metrics.elapsed_compute().timer(); + let result = create_batch_from_map( + &this.mode, + &this.accumulators, + this.group_expr.len(), + &this.schema, + ) + .record_output(&this.baseline_metrics); + + timer.done(); + result + } + }; + + this.finished = true; + return Poll::Ready(Some(result)); + } + } +} + +impl RecordBatchStream for GroupedHashAggregateStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +/// TODO: Make this a member function of [`GroupedHashAggregateStream`] +fn group_aggregate_batch( + mode: &AggregateMode, + random_state: &RandomState, + group_expr: &[Arc], + aggr_expr: &[Arc], + batch: RecordBatch, + accumulators: &mut Accumulators, + aggregate_expressions: &[Vec>], +) -> Result<()> { + // evaluate the grouping expressions + let group_values = 
evaluate(group_expr, &batch)?; + + // evaluate the aggregation expressions. + // We could evaluate them after the `take`, but since we need to evaluate all + // of them anyways, it is more performant to do it while they are together. + let aggr_input_values = evaluate_many(aggregate_expressions, &batch)?; + + // 1.1 construct the key from the group values + // 1.2 construct the mapping key if it does not exist + // 1.3 add the row' index to `indices` + + // track which entries in `accumulators` have rows in this batch to aggregate + let mut groups_with_rows = vec![]; + + // 1.1 Calculate the group keys for the group values + let mut batch_hashes = vec![0; batch.num_rows()]; + create_hashes(&group_values, random_state, &mut batch_hashes)?; + + for (row, hash) in batch_hashes.into_iter().enumerate() { + let Accumulators { map, group_states } = accumulators; + + let entry = map.get_mut(hash, |(_hash, group_idx)| { + // verify that a group that we are inserting with hash is + // actually the same key value as the group in + // existing_idx (aka group_values @ row) + let group_state = &group_states[*group_idx]; + group_values + .iter() + .zip(group_state.group_by_values.iter()) + .all(|(array, scalar)| scalar.eq_array(array, row)) + }); + + match entry { + // Existing entry for this group value + Some((_hash, group_idx)) => { + let group_state = &mut group_states[*group_idx]; + // 1.3 + if group_state.indices.is_empty() { + groups_with_rows.push(*group_idx); + }; + group_state.indices.push(row as u32); // remember this row + } + // 1.2 Need to create new entry + None => { + let accumulator_set = aggregates::create_accumulators(aggr_expr)?; + + // Copy group values out of arrays into `ScalarValue`s + let group_by_values = group_values + .iter() + .map(|col| ScalarValue::try_from_array(col, row)) + .collect::>>()?; + + // Add new entry to group_states and save newly created index + let group_state = GroupState { + group_by_values: group_by_values.into_boxed_slice(), + 
accumulator_set, + indices: vec![row as u32], // 1.3 + }; + let group_idx = group_states.len(); + group_states.push(group_state); + groups_with_rows.push(group_idx); + + // for hasher function, use precomputed hash value + map.insert(hash, (hash, group_idx), |(hash, _group_idx)| *hash); + } + }; + } + + // Collect all indices + offsets based on keys in this vec + let mut batch_indices: UInt32Builder = UInt32Builder::new(0); + let mut offsets = vec![0]; + let mut offset_so_far = 0; + for group_idx in groups_with_rows.iter() { + let indices = &accumulators.group_states[*group_idx].indices; + batch_indices.append_slice(indices)?; + offset_so_far += indices.len(); + offsets.push(offset_so_far); + } + let batch_indices = batch_indices.finish(); + + // `Take` all values based on indices into Arrays + let values: Vec>> = aggr_input_values + .iter() + .map(|array| { + array + .iter() + .map(|array| { + compute::take( + array.as_ref(), + &batch_indices, + None, // None: no index check + ) + .unwrap() + }) + .collect() + // 2.3 + }) + .collect(); + + // 2.1 for each key in this batch + // 2.2 for each aggregation + // 2.3 `slice` from each of its arrays the keys' values + // 2.4 update / merge the accumulator with the values + // 2.5 clear indices + groups_with_rows + .iter() + .zip(offsets.windows(2)) + .try_for_each(|(group_idx, offsets)| { + let group_state = &mut accumulators.group_states[*group_idx]; + // 2.2 + group_state + .accumulator_set + .iter_mut() + .zip(values.iter()) + .map(|(accumulator, aggr_array)| { + ( + accumulator, + aggr_array + .iter() + .map(|array| { + // 2.3 + array.slice(offsets[0], offsets[1] - offsets[0]) + }) + .collect::>(), + ) + }) + .try_for_each(|(accumulator, values)| match mode { + AggregateMode::Partial => accumulator.update_batch(&values), + AggregateMode::FinalPartitioned | AggregateMode::Final => { + // note: the aggregation here is over states, not values, thus the merge + accumulator.merge_batch(&values) + } + }) + // 2.5 + .and({ 
+ group_state.indices.clear(); + Ok(()) + }) + })?; + + Ok(()) +} + +/// The state that is built for each output group. +#[derive(Debug)] +struct GroupState { + /// The actual group by values, one for each group column + group_by_values: Box<[ScalarValue]>, + + // Accumulator state, one for each aggregate + accumulator_set: Vec, + + /// scratch space used to collect indices for input rows in a + /// bach that have values to aggregate. Reset on each batch + indices: Vec, +} + +/// The state of all the groups +#[derive(Default)] +struct Accumulators { + /// Logically maps group values to an index in `group_states` + /// + /// Uses the raw API of hashbrown to avoid actually storing the + /// keys in the table + /// + /// keys: u64 hashes of the GroupValue + /// values: (hash, index into `group_states`) + map: RawTable<(u64, usize)>, + + /// State for each group + group_states: Vec, +} + +impl std::fmt::Debug for Accumulators { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + // hashes are not store inline, so could only get values + let map_string = "RawTable"; + f.debug_struct("Accumulators") + .field("map", &map_string) + .field("group_states", &self.group_states) + .finish() + } +} + +/// Evaluates expressions against a record batch. +fn evaluate( + expr: &[Arc], + batch: &RecordBatch, +) -> Result> { + expr.iter() + .map(|expr| expr.evaluate(batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>() +} + +/// Evaluates expressions against a record batch. +fn evaluate_many( + expr: &[Vec>], + batch: &RecordBatch, +) -> Result>> { + expr.iter() + .map(|expr| evaluate(expr, batch)) + .collect::>>() +} + +/// Create a RecordBatch with all group keys and accumulator' states or values. 
+fn create_batch_from_map( + mode: &AggregateMode, + accumulators: &Accumulators, + num_group_expr: usize, + output_schema: &Schema, +) -> ArrowResult { + if accumulators.group_states.is_empty() { + return Ok(RecordBatch::new_empty(Arc::new(output_schema.to_owned()))); + } + let accs = &accumulators.group_states[0].accumulator_set; + let mut acc_data_types: Vec = vec![]; + + // Calculate number/shape of state arrays + match mode { + AggregateMode::Partial => { + for acc in accs.iter() { + let state = acc.state()?; + acc_data_types.push(state.len()); + } + } + AggregateMode::Final | AggregateMode::FinalPartitioned => { + acc_data_types = vec![1; accs.len()]; + } + } + + let mut columns = (0..num_group_expr) + .map(|i| { + ScalarValue::iter_to_array( + accumulators + .group_states + .iter() + .map(|group_state| group_state.group_by_values[i].clone()), + ) + }) + .collect::>>()?; + + // add state / evaluated arrays + for (x, &state_len) in acc_data_types.iter().enumerate() { + for y in 0..state_len { + match mode { + AggregateMode::Partial => { + let res = ScalarValue::iter_to_array( + accumulators.group_states.iter().map(|group_state| { + let x = group_state.accumulator_set[x].state().unwrap(); + x[y].clone() + }), + )?; + + columns.push(res); + } + AggregateMode::Final | AggregateMode::FinalPartitioned => { + let res = ScalarValue::iter_to_array( + accumulators.group_states.iter().map(|group_state| { + group_state.accumulator_set[x].evaluate().unwrap() + }), + )?; + columns.push(res); + } + } + } + } + + // cast output if needed (e.g. 
for types like Dictionary where + // the intermediate GroupByScalar type was not the same as the + // output + let columns = columns + .iter() + .zip(output_schema.fields().iter()) + .map(|(col, desired_field)| cast(col, desired_field.data_type())) + .collect::>>()?; + + RecordBatch::try_new(Arc::new(output_schema.to_owned()), columns) +} diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index c0208b23974bd..af7df3dccfc71 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -17,5 +17,724 @@ //! Aggregates functionalities +use crate::execution::context::TaskContext; +use crate::physical_plan::aggregates::hash::GroupedHashAggregateStream; +use crate::physical_plan::aggregates::no_grouping::NoGroupingAggregateStream; +use crate::physical_plan::metrics::{ + BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, +}; +use crate::physical_plan::{ + DisplayFormatType, Distribution, ExecutionPlan, Partitioning, + SendableRecordBatchStream, Statistics, +}; +use arrow::array::ArrayRef; +use arrow::datatypes::{Field, Schema, SchemaRef}; +use async_trait::async_trait; +use datafusion_common::Result; +use datafusion_expr::Accumulator; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::{ + expressions, AggregateExpr, PhysicalExpr, PhysicalSortExpr, +}; +use std::any::Any; +use std::sync::Arc; + +mod hash; +mod no_grouping; + pub use datafusion_expr::AggregateFunction; pub use datafusion_physical_expr::expressions::create_aggregate_expr; + +/// Hash aggregate modes +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum AggregateMode { + /// Partial aggregate that can be applied in parallel across input partitions + Partial, + /// Final aggregate that produces a single partition of output + Final, + /// Final aggregate that works on pre-partitioned data. 
+ /// + /// This requires the invariant that all rows with a particular + /// grouping key are in the same partitions, such as is the case + /// with Hash repartitioning on the group keys. If a group key is + /// duplicated, duplicate groups would be produced + FinalPartitioned, +} + +/// Hash aggregate execution plan +#[derive(Debug)] +pub struct AggregateExec { + /// Aggregation mode (full, partial) + mode: AggregateMode, + /// Grouping expressions + group_expr: Vec<(Arc, String)>, + /// Aggregate expressions + aggr_expr: Vec>, + /// Input plan, could be a partial aggregate or the input to the aggregate + input: Arc, + /// Schema after the aggregate is applied + schema: SchemaRef, + /// Input schema before any aggregation is applied. For partial aggregate this will be the + /// same as input.schema() but for the final aggregate it will be the same as the input + /// to the partial aggregate + input_schema: SchemaRef, + /// Execution Metrics + metrics: ExecutionPlanMetricsSet, +} + +impl AggregateExec { + /// Create a new hash aggregate execution plan + pub fn try_new( + mode: AggregateMode, + group_expr: Vec<(Arc, String)>, + aggr_expr: Vec>, + input: Arc, + input_schema: SchemaRef, + ) -> Result { + let schema = create_schema(&input.schema(), &group_expr, &aggr_expr, mode)?; + + let schema = Arc::new(schema); + + Ok(AggregateExec { + mode, + group_expr, + aggr_expr, + input, + schema, + input_schema, + metrics: ExecutionPlanMetricsSet::new(), + }) + } + + /// Aggregation mode (full, partial) + pub fn mode(&self) -> &AggregateMode { + &self.mode + } + + /// Grouping expressions + pub fn group_expr(&self) -> &[(Arc, String)] { + &self.group_expr + } + + /// Grouping expressions as they occur in the output schema + pub fn output_group_expr(&self) -> Vec> { + // Update column indices. Since the group by columns come first in the output schema, their + // indices are simply 0..self.group_expr(len). 
+ self.group_expr + .iter() + .enumerate() + .map(|(index, (_col, name))| { + Arc::new(expressions::Column::new(name, index)) as Arc + }) + .collect() + } + + /// Aggregate expressions + pub fn aggr_expr(&self) -> &[Arc] { + &self.aggr_expr + } + + /// Input plan + pub fn input(&self) -> &Arc { + &self.input + } + + /// Get the input schema before any aggregates are applied + pub fn input_schema(&self) -> SchemaRef { + self.input_schema.clone() + } +} + +#[async_trait] +impl ExecutionPlan for AggregateExec { + /// Return a reference to Any that can be used for down-casting + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + /// Get the output partitioning of this plan + fn output_partitioning(&self) -> Partitioning { + self.input.output_partitioning() + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn required_child_distribution(&self) -> Distribution { + match &self.mode { + AggregateMode::Partial => Distribution::UnspecifiedDistribution, + AggregateMode::FinalPartitioned => Distribution::HashPartitioned( + self.group_expr.iter().map(|x| x.0.clone()).collect(), + ), + AggregateMode::Final => Distribution::SinglePartition, + } + } + + fn relies_on_input_order(&self) -> bool { + false + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + Ok(Arc::new(AggregateExec::try_new( + self.mode, + self.group_expr.clone(), + self.aggr_expr.clone(), + children[0].clone(), + self.input_schema.clone(), + )?)) + } + + async fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let input = self.input.execute(partition, context).await?; + let group_expr = self.group_expr.iter().map(|x| x.0.clone()).collect(); + + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + + if self.group_expr.is_empty() { + Ok(Box::pin(NoGroupingAggregateStream::new( + self.mode, + 
 self.schema.clone(), + self.aggr_expr.clone(), + input, + baseline_metrics, + )?)) + } else { + Ok(Box::pin(GroupedHashAggregateStream::new( + self.mode, + self.schema.clone(), + group_expr, + self.aggr_expr.clone(), + input, + baseline_metrics, + )?)) + } + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "HashAggregateExec: mode={:?}", self.mode)?; + let g: Vec = self + .group_expr + .iter() + .map(|(e, alias)| { + let e = e.to_string(); + if &e != alias { + format!("{} as {}", e, alias) + } else { + e + } + }) + .collect(); + write!(f, ", gby=[{}]", g.join(", "))?; + + let a: Vec = self + .aggr_expr + .iter() + .map(|agg| agg.name().to_string()) + .collect(); + write!(f, ", aggr=[{}]", a.join(", "))?; + } + } + Ok(()) + } + + fn statistics(&self) -> Statistics { + // TODO stats: group expressions: + // - once expressions will be able to compute their own stats, use it here + // - case where we group by on a column for which we have the `distinct` stat + // TODO stats: aggr expression: + // - aggregations sometimes also preserve invariants such as min, max... 
+ match self.mode { + AggregateMode::Final | AggregateMode::FinalPartitioned + if self.group_expr.is_empty() => + { + Statistics { + num_rows: Some(1), + is_exact: true, + ..Default::default() + } + } + _ => Statistics::default(), + } + } +} + +fn create_schema( + input_schema: &Schema, + group_expr: &[(Arc, String)], + aggr_expr: &[Arc], + mode: AggregateMode, +) -> datafusion_common::Result { + let mut fields = Vec::with_capacity(group_expr.len() + aggr_expr.len()); + for (expr, name) in group_expr { + fields.push(Field::new( + name, + expr.data_type(input_schema)?, + expr.nullable(input_schema)?, + )) + } + + match mode { + AggregateMode::Partial => { + // in partial mode, the fields of the accumulator's state + for expr in aggr_expr { + fields.extend(expr.state_fields()?.iter().cloned()) + } + } + AggregateMode::Final | AggregateMode::FinalPartitioned => { + // in final mode, the field with the final result of the accumulator + for expr in aggr_expr { + fields.push(expr.field()?) + } + } + } + + Ok(Schema::new(fields)) +} + +/// returns physical expressions to evaluate against a batch +/// The expressions are different depending on `mode`: +/// * Partial: AggregateExpr::expressions +/// * Final: columns of `AggregateExpr::state_fields()` +fn aggregate_expressions( + aggr_expr: &[Arc], + mode: &AggregateMode, + col_idx_base: usize, +) -> datafusion_common::Result>>> { + match mode { + AggregateMode::Partial => { + Ok(aggr_expr.iter().map(|agg| agg.expressions()).collect()) + } + // in this mode, we build the merge expressions of the aggregation + AggregateMode::Final | AggregateMode::FinalPartitioned => { + let mut col_idx_base = col_idx_base; + Ok(aggr_expr + .iter() + .map(|agg| { + let exprs = merge_expressions(col_idx_base, agg)?; + col_idx_base += exprs.len(); + Ok(exprs) + }) + .collect::>>()?) + } + } +} + +/// uses `state_fields` to build a vec of physical column expressions required to merge the +/// AggregateExpr' accumulator's state. 
+/// +/// `index_base` is the starting physical column index for the next expanded state field. +fn merge_expressions( + index_base: usize, + expr: &Arc, +) -> Result>> { + Ok(expr + .state_fields()? + .iter() + .enumerate() + .map(|(idx, f)| { + Arc::new(Column::new(f.name(), index_base + idx)) as Arc + }) + .collect::>()) +} + +pub(crate) type AccumulatorItem = Box; + +fn create_accumulators( + aggr_expr: &[Arc], +) -> datafusion_common::Result> { + aggr_expr + .iter() + .map(|expr| expr.create_accumulator()) + .collect::>>() +} + +/// returns a vector of ArrayRefs, where each entry corresponds to either the +/// final value (mode = Final) or states (mode = Partial) +fn finalize_aggregation( + accumulators: &[AccumulatorItem], + mode: &AggregateMode, +) -> datafusion_common::Result> { + match mode { + AggregateMode::Partial => { + // build the vector of states + let a = accumulators + .iter() + .map(|accumulator| accumulator.state()) + .map(|value| { + value.map(|e| { + e.iter().map(|v| v.to_array()).collect::>() + }) + }) + .collect::>>()?; + Ok(a.iter().flatten().cloned().collect::>()) + } + AggregateMode::Final | AggregateMode::FinalPartitioned => { + // merge the state to the final value + accumulators + .iter() + .map(|accumulator| accumulator.evaluate().map(|v| v.to_array())) + .collect::>>() + } + } +} + +#[cfg(test)] +mod tests { + use crate::execution::context::TaskContext; + use crate::from_slice::FromSlice; + use crate::physical_plan::aggregates::{AggregateExec, AggregateMode}; + use crate::physical_plan::expressions::{col, Avg}; + use crate::test::assert_is_pending; + use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; + use crate::{assert_batches_sorted_eq, physical_plan::common}; + use arrow::array::{Float64Array, UInt32Array}; + use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; + use arrow::error::Result as ArrowResult; + use arrow::record_batch::RecordBatch; + use async_trait::async_trait; + use 
datafusion_common::{DataFusionError, Result}; + use datafusion_physical_expr::{AggregateExpr, PhysicalExpr, PhysicalSortExpr}; + use futures::{FutureExt, Stream}; + use std::any::Any; + use std::sync::Arc; + use std::task::{Context, Poll}; + + use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; + use crate::physical_plan::{ + ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, + Statistics, + }; + use crate::prelude::SessionContext; + + /// some mock data to aggregates + fn some_data() -> (Arc, Vec) { + // define a schema. + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::UInt32, false), + Field::new("b", DataType::Float64, false), + ])); + + // define data. + ( + schema.clone(), + vec![ + RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt32Array::from_slice(&[2, 3, 4, 4])), + Arc::new(Float64Array::from_slice(&[1.0, 2.0, 3.0, 4.0])), + ], + ) + .unwrap(), + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt32Array::from_slice(&[2, 3, 3, 4])), + Arc::new(Float64Array::from_slice(&[1.0, 2.0, 3.0, 4.0])), + ], + ) + .unwrap(), + ], + ) + } + + /// build the aggregates on the data from some_data() and check the results + async fn check_aggregates(input: Arc) -> Result<()> { + let input_schema = input.schema(); + + let groups: Vec<(Arc, String)> = + vec![(col("a", &input_schema)?, "a".to_string())]; + + let aggregates: Vec> = vec![Arc::new(Avg::new( + col("b", &input_schema)?, + "AVG(b)".to_string(), + DataType::Float64, + ))]; + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + + let partial_aggregate = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + groups.clone(), + aggregates.clone(), + input, + input_schema.clone(), + )?); + + let result = + common::collect(partial_aggregate.execute(0, task_ctx.clone()).await?) 
+ .await?; + + let expected = vec![ + "+---+---------------+-------------+", + "| a | AVG(b)[count] | AVG(b)[sum] |", + "+---+---------------+-------------+", + "| 2 | 2 | 2 |", + "| 3 | 3 | 7 |", + "| 4 | 3 | 11 |", + "+---+---------------+-------------+", + ]; + assert_batches_sorted_eq!(expected, &result); + + let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); + + let final_group: Vec> = (0..groups.len()) + .map(|i| col(&groups[i].1, &input_schema)) + .collect::>()?; + + let merged_aggregate = Arc::new(AggregateExec::try_new( + AggregateMode::Final, + final_group + .iter() + .enumerate() + .map(|(i, expr)| (expr.clone(), groups[i].1.clone())) + .collect(), + aggregates, + merge, + input_schema, + )?); + + let result = + common::collect(merged_aggregate.execute(0, task_ctx.clone()).await?).await?; + assert_eq!(result.len(), 1); + + let batch = &result[0]; + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.num_rows(), 3); + + let expected = vec![ + "+---+--------------------+", + "| a | AVG(b) |", + "+---+--------------------+", + "| 2 | 1 |", + "| 3 | 2.3333333333333335 |", // 3, (2 + 3 + 2) / 3 + "| 4 | 3.6666666666666665 |", // 4, (3 + 4 + 4) / 3 + "+---+--------------------+", + ]; + + assert_batches_sorted_eq!(&expected, &result); + + let metrics = merged_aggregate.metrics().unwrap(); + let output_rows = metrics.output_rows().unwrap(); + assert_eq!(3, output_rows); + + Ok(()) + } + + /// Define a test source that can yield back to runtime before returning its first item /// + + #[derive(Debug)] + struct TestYieldingExec { + /// True if this exec should yield back to runtime the first time it is polled + pub yield_first: bool, + } + + #[async_trait] + impl ExecutionPlan for TestYieldingExec { + fn as_any(&self) -> &dyn Any { + self + } + fn schema(&self) -> SchemaRef { + some_data().0 + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn output_ordering(&self) -> 
Option<&[PhysicalSortExpr]> { + None + } + + fn children(&self) -> Vec> { + vec![] + } + + fn with_new_children( + self: Arc, + _: Vec>, + ) -> Result> { + Err(DataFusionError::Internal(format!( + "Children cannot be replaced in {:?}", + self + ))) + } + + async fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + let stream = if self.yield_first { + TestYieldingStream::New + } else { + TestYieldingStream::Yielded + }; + + Ok(Box::pin(stream)) + } + + fn statistics(&self) -> Statistics { + let (_, batches) = some_data(); + common::compute_record_batch_statistics(&[batches], &self.schema(), None) + } + } + + /// A stream using the demo data. If inited as new, it will first yield to runtime before returning records + enum TestYieldingStream { + New, + Yielded, + ReturnedBatch1, + ReturnedBatch2, + } + + impl Stream for TestYieldingStream { + type Item = ArrowResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + match &*self { + TestYieldingStream::New => { + *(self.as_mut()) = TestYieldingStream::Yielded; + cx.waker().wake_by_ref(); + Poll::Pending + } + TestYieldingStream::Yielded => { + *(self.as_mut()) = TestYieldingStream::ReturnedBatch1; + Poll::Ready(Some(Ok(some_data().1[0].clone()))) + } + TestYieldingStream::ReturnedBatch1 => { + *(self.as_mut()) = TestYieldingStream::ReturnedBatch2; + Poll::Ready(Some(Ok(some_data().1[1].clone()))) + } + TestYieldingStream::ReturnedBatch2 => Poll::Ready(None), + } + } + } + + impl RecordBatchStream for TestYieldingStream { + fn schema(&self) -> SchemaRef { + some_data().0 + } + } + + //// Tests //// + + #[tokio::test] + async fn aggregate_source_not_yielding() -> Result<()> { + let input: Arc = + Arc::new(TestYieldingExec { yield_first: false }); + + check_aggregates(input).await + } + + #[tokio::test] + async fn aggregate_source_with_yielding() -> Result<()> { + let input: Arc = + Arc::new(TestYieldingExec { yield_first: true }); + + 
check_aggregates(input).await + } + + #[tokio::test] + async fn test_drop_cancel_without_groups() -> Result<()> { + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let schema = + Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, true)])); + + let groups = vec![]; + + let aggregates: Vec> = vec![Arc::new(Avg::new( + col("a", &schema)?, + "AVG(a)".to_string(), + DataType::Float64, + ))]; + + let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); + let refs = blocking_exec.refs(); + let hash_aggregate_exec = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + groups.clone(), + aggregates.clone(), + blocking_exec, + schema, + )?); + + let fut = crate::physical_plan::collect(hash_aggregate_exec, task_ctx); + let mut fut = fut.boxed(); + + assert_is_pending(&mut fut); + drop(fut); + assert_strong_count_converges_to_zero(refs).await; + + Ok(()) + } + + #[tokio::test] + async fn test_drop_cancel_with_groups() -> Result<()> { + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Float32, true), + Field::new("b", DataType::Float32, true), + ])); + + let groups: Vec<(Arc, String)> = + vec![(col("a", &schema)?, "a".to_string())]; + + let aggregates: Vec> = vec![Arc::new(Avg::new( + col("b", &schema)?, + "AVG(b)".to_string(), + DataType::Float64, + ))]; + + let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); + let refs = blocking_exec.refs(); + let hash_aggregate_exec = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + groups.clone(), + aggregates.clone(), + blocking_exec, + schema, + )?); + + let fut = crate::physical_plan::collect(hash_aggregate_exec, task_ctx); + let mut fut = fut.boxed(); + + assert_is_pending(&mut fut); + drop(fut); + assert_strong_count_converges_to_zero(refs).await; + + Ok(()) + } +} diff --git 
a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs new file mode 100644 index 0000000000000..3398eba3cfa82 --- /dev/null +++ b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
 Aggregate without grouping columns + +use crate::physical_plan::aggregates::{ + aggregate_expressions, create_accumulators, finalize_aggregation, AccumulatorItem, + AggregateMode, +}; +use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; +use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; +use arrow::datatypes::SchemaRef; +use arrow::error::{ArrowError, Result as ArrowResult}; +use arrow::record_batch::RecordBatch; +use datafusion_common::Result; +use datafusion_physical_expr::{AggregateExpr, PhysicalExpr}; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use futures::{ + ready, + stream::{Stream, StreamExt}, +}; + +/// stream struct for aggregation without grouping columns +pub(crate) struct NoGroupingAggregateStream { + schema: SchemaRef, + mode: AggregateMode, + input: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + aggregate_expressions: Vec>>, + accumulators: Vec, + finished: bool, +} + +impl NoGroupingAggregateStream { + /// Create a new NoGroupingAggregateStream + pub fn new( + mode: AggregateMode, + schema: SchemaRef, + aggr_expr: Vec>, + input: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + ) -> datafusion_common::Result { + let aggregate_expressions = aggregate_expressions(&aggr_expr, &mode, 0)?; + let accumulators = create_accumulators(&aggr_expr)?; + + Ok(Self { + schema, + mode, + input, + baseline_metrics, + aggregate_expressions, + accumulators, + finished: false, + }) + } +} + +impl Stream for NoGroupingAggregateStream { + type Item = ArrowResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let this = &mut *self; + if this.finished { + return Poll::Ready(None); + } + + let elapsed_compute = this.baseline_metrics.elapsed_compute(); + + loop { + let result = match ready!(this.input.poll_next_unpin(cx)) { + Some(Ok(batch)) => { + let timer = elapsed_compute.timer(); + let result = aggregate_batch( + &this.mode, + &batch, 
&mut this.accumulators, + &this.aggregate_expressions, + ); + + timer.done(); + + match result { + Ok(_) => continue, + Err(e) => Err(ArrowError::ExternalError(Box::new(e))), + } + } + Some(Err(e)) => Err(e), + None => { + this.finished = true; + let timer = this.baseline_metrics.elapsed_compute().timer(); + let result = finalize_aggregation(&this.accumulators, &this.mode) + .map_err(|e| ArrowError::ExternalError(Box::new(e))) + .and_then(|columns| { + RecordBatch::try_new(this.schema.clone(), columns) + }) + .record_output(&this.baseline_metrics); + + timer.done(); + result + } + }; + + this.finished = true; + return Poll::Ready(Some(result)); + } + } +} + +impl RecordBatchStream for NoGroupingAggregateStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +/// TODO: Make this a member function +fn aggregate_batch( + mode: &AggregateMode, + batch: &RecordBatch, + accumulators: &mut [AccumulatorItem], + expressions: &[Vec>], +) -> Result<()> { + // 1.1 iterate accumulators and respective expressions together + // 1.2 evaluate expressions + // 1.3 update / merge accumulators with the expressions' values + + // 1.1 + accumulators + .iter_mut() + .zip(expressions) + .try_for_each(|(accum, expr)| { + // 1.2 + let values = &expr + .iter() + .map(|e| e.evaluate(batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>()?; + + // 1.3 + match mode { + AggregateMode::Partial => accum.update_batch(values), + AggregateMode::Final | AggregateMode::FinalPartitioned => { + accum.merge_batch(values) + } + } + }) +} diff --git a/datafusion/core/src/physical_plan/hash_aggregate.rs b/datafusion/core/src/physical_plan/hash_aggregate.rs deleted file mode 100644 index 6431745579975..0000000000000 --- a/datafusion/core/src/physical_plan/hash_aggregate.rs +++ /dev/null @@ -1,1299 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines the execution plan for the hash aggregate operation - -use std::any::Any; -use std::sync::Arc; -use std::task::{Context, Poll}; -use std::vec; - -use ahash::RandomState; -use futures::{ - ready, - stream::{Stream, StreamExt}, -}; - -use crate::error::Result; -use crate::physical_plan::hash_utils::create_hashes; -use crate::physical_plan::{ - Accumulator, AggregateExpr, DisplayFormatType, Distribution, ExecutionPlan, - Partitioning, PhysicalExpr, -}; -use crate::scalar::ScalarValue; - -use arrow::{array::ArrayRef, compute, compute::cast}; -use arrow::{ - array::{Array, UInt32Builder}, - error::{ArrowError, Result as ArrowResult}, -}; -use arrow::{ - datatypes::{Field, Schema, SchemaRef}, - record_batch::RecordBatch, -}; -use hashbrown::raw::RawTable; - -use crate::execution::context::TaskContext; -use async_trait::async_trait; - -use super::expressions::PhysicalSortExpr; -use super::metrics::{ - BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, RecordOutput, -}; -use super::Statistics; -use super::{expressions::Column, RecordBatchStream, SendableRecordBatchStream}; - -/// Hash aggregate modes -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub enum AggregateMode { - /// Partial aggregate that can be applied in parallel across input partitions - Partial, - 
/// Final aggregate that produces a single partition of output - Final, - /// Final aggregate that works on pre-partitioned data. - /// - /// This requires the invariant that all rows with a particular - /// grouping key are in the same partitions, such as is the case - /// with Hash repartitioning on the group keys. If a group key is - /// duplicated, duplicate groups would be produced - FinalPartitioned, -} - -/// Hash aggregate execution plan -#[derive(Debug)] -pub struct HashAggregateExec { - /// Aggregation mode (full, partial) - mode: AggregateMode, - /// Grouping expressions - group_expr: Vec<(Arc, String)>, - /// Aggregate expressions - aggr_expr: Vec>, - /// Input plan, could be a partial aggregate or the input to the aggregate - input: Arc, - /// Schema after the aggregate is applied - schema: SchemaRef, - /// Input schema before any aggregation is applied. For partial aggregate this will be the - /// same as input.schema() but for the final aggregate it will be the same as the input - /// to the partial aggregate - input_schema: SchemaRef, - /// Execution Metrics - metrics: ExecutionPlanMetricsSet, -} - -fn create_schema( - input_schema: &Schema, - group_expr: &[(Arc, String)], - aggr_expr: &[Arc], - mode: AggregateMode, -) -> Result { - let mut fields = Vec::with_capacity(group_expr.len() + aggr_expr.len()); - for (expr, name) in group_expr { - fields.push(Field::new( - name, - expr.data_type(input_schema)?, - expr.nullable(input_schema)?, - )) - } - - match mode { - AggregateMode::Partial => { - // in partial mode, the fields of the accumulator's state - for expr in aggr_expr { - fields.extend(expr.state_fields()?.iter().cloned()) - } - } - AggregateMode::Final | AggregateMode::FinalPartitioned => { - // in final mode, the field with the final result of the accumulator - for expr in aggr_expr { - fields.push(expr.field()?) 
- } - } - } - - Ok(Schema::new(fields)) -} - -impl HashAggregateExec { - /// Create a new hash aggregate execution plan - pub fn try_new( - mode: AggregateMode, - group_expr: Vec<(Arc, String)>, - aggr_expr: Vec>, - input: Arc, - input_schema: SchemaRef, - ) -> Result { - let schema = create_schema(&input.schema(), &group_expr, &aggr_expr, mode)?; - - let schema = Arc::new(schema); - - Ok(HashAggregateExec { - mode, - group_expr, - aggr_expr, - input, - schema, - input_schema, - metrics: ExecutionPlanMetricsSet::new(), - }) - } - - /// Aggregation mode (full, partial) - pub fn mode(&self) -> &AggregateMode { - &self.mode - } - - /// Grouping expressions - pub fn group_expr(&self) -> &[(Arc, String)] { - &self.group_expr - } - - /// Grouping expressions as they occur in the output schema - pub fn output_group_expr(&self) -> Vec> { - // Update column indices. Since the group by columns come first in the output schema, their - // indices are simply 0..self.group_expr(len). - self.group_expr - .iter() - .enumerate() - .map(|(index, (_col, name))| { - Arc::new(Column::new(name, index)) as Arc - }) - .collect() - } - - /// Aggregate expressions - pub fn aggr_expr(&self) -> &[Arc] { - &self.aggr_expr - } - - /// Input plan - pub fn input(&self) -> &Arc { - &self.input - } - - /// Get the input schema before any aggregates are applied - pub fn input_schema(&self) -> SchemaRef { - self.input_schema.clone() - } -} - -#[async_trait] -impl ExecutionPlan for HashAggregateExec { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - - fn children(&self) -> Vec> { - vec![self.input.clone()] - } - - fn required_child_distribution(&self) -> Distribution { - match &self.mode { - AggregateMode::Partial => Distribution::UnspecifiedDistribution, - AggregateMode::FinalPartitioned => Distribution::HashPartitioned( - self.group_expr.iter().map(|x| x.0.clone()).collect(), - 
), - AggregateMode::Final => Distribution::SinglePartition, - } - } - - /// Get the output partitioning of this plan - fn output_partitioning(&self) -> Partitioning { - self.input.output_partitioning() - } - - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - None - } - - fn relies_on_input_order(&self) -> bool { - false - } - - async fn execute( - &self, - partition: usize, - context: Arc, - ) -> Result { - let input = self.input.execute(partition, context).await?; - let group_expr = self.group_expr.iter().map(|x| x.0.clone()).collect(); - - let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); - - if self.group_expr.is_empty() { - Ok(Box::pin(HashAggregateStream::new( - self.mode, - self.schema.clone(), - self.aggr_expr.clone(), - input, - baseline_metrics, - )?)) - } else { - Ok(Box::pin(GroupedHashAggregateStream::new( - self.mode, - self.schema.clone(), - group_expr, - self.aggr_expr.clone(), - input, - baseline_metrics, - )?)) - } - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> Result> { - Ok(Arc::new(HashAggregateExec::try_new( - self.mode, - self.group_expr.clone(), - self.aggr_expr.clone(), - children[0].clone(), - self.input_schema.clone(), - )?)) - } - - fn metrics(&self) -> Option { - Some(self.metrics.clone_inner()) - } - - fn fmt_as( - &self, - t: DisplayFormatType, - f: &mut std::fmt::Formatter, - ) -> std::fmt::Result { - match t { - DisplayFormatType::Default => { - write!(f, "HashAggregateExec: mode={:?}", self.mode)?; - let g: Vec = self - .group_expr - .iter() - .map(|(e, alias)| { - let e = e.to_string(); - if &e != alias { - format!("{} as {}", e, alias) - } else { - e - } - }) - .collect(); - write!(f, ", gby=[{}]", g.join(", "))?; - - let a: Vec = self - .aggr_expr - .iter() - .map(|agg| agg.name().to_string()) - .collect(); - write!(f, ", aggr=[{}]", a.join(", "))?; - } - } - Ok(()) - } - - fn statistics(&self) -> Statistics { - // TODO stats: group expressions: - // - once expressions will be 
able to compute their own stats, use it here - // - case where we group by on a column for which with have the `distinct` stat - // TODO stats: aggr expression: - // - aggregations somtimes also preserve invariants such as min, max... - match self.mode { - AggregateMode::Final | AggregateMode::FinalPartitioned - if self.group_expr.is_empty() => - { - Statistics { - num_rows: Some(1), - is_exact: true, - ..Default::default() - } - } - _ => Statistics::default(), - } - } -} - -/* -The architecture is the following: - -1. An accumulator has state that is updated on each batch. -2. At the end of the aggregation (e.g. end of batches in a partition), the accumulator converts its state to a RecordBatch of a single row -3. The RecordBatches of all accumulators are merged (`concatenate` in `rust/arrow`) together to a single RecordBatch. -4. The state's RecordBatch is `merge`d to a new state -5. The state is mapped to the final value - -Why: - -* Accumulators' state can be statically typed, but it is more efficient to transmit data from the accumulators via `Array` -* The `merge` operation must have access to the state of the aggregators because it uses it to correctly merge -* It uses Arrow's native dynamically typed object, `Array`. -* Arrow shines in batch operations and both `merge` and `concatenate` of uniform types are very performant. - -Example: average - -* the state is `n: u32` and `sum: f64` -* For every batch, we update them accordingly. -* At the end of the accumulation (of a partition), we convert `n` and `sum` to a RecordBatch of 1 row and two columns: `[n, sum]` -* The RecordBatch is (sent back / transmitted over network) -* Once all N record batches arrive, `merge` is performed, which builds a RecordBatch with N rows and 2 columns. 
-* Finally, `get_value` returns an array with one entry computed from the state -*/ -struct GroupedHashAggregateStream { - schema: SchemaRef, - input: SendableRecordBatchStream, - mode: AggregateMode, - accumulators: Accumulators, - aggregate_expressions: Vec>>, - - aggr_expr: Vec>, - group_expr: Vec>, - - baseline_metrics: BaselineMetrics, - random_state: RandomState, - finished: bool, -} - -impl GroupedHashAggregateStream { - /// Create a new HashAggregateStream - pub fn new( - mode: AggregateMode, - schema: SchemaRef, - group_expr: Vec>, - aggr_expr: Vec>, - input: SendableRecordBatchStream, - baseline_metrics: BaselineMetrics, - ) -> Result { - let timer = baseline_metrics.elapsed_compute().timer(); - - // The expressions to evaluate the batch, one vec of expressions per aggregation. - // Assume create_schema() always put group columns in front of aggr columns, we set - // col_idx_base to group expression count. - let aggregate_expressions = - aggregate_expressions(&aggr_expr, &mode, group_expr.len())?; - - timer.done(); - - Ok(Self { - schema, - mode, - input, - aggr_expr, - group_expr, - baseline_metrics, - aggregate_expressions, - accumulators: Default::default(), - random_state: Default::default(), - finished: false, - }) - } -} - -impl Stream for GroupedHashAggregateStream { - type Item = ArrowResult; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - let this = &mut *self; - if this.finished { - return Poll::Ready(None); - } - - let elapsed_compute = this.baseline_metrics.elapsed_compute(); - - loop { - let result = match ready!(this.input.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - let timer = elapsed_compute.timer(); - let result = group_aggregate_batch( - &this.mode, - &this.random_state, - &this.group_expr, - &this.aggr_expr, - batch, - &mut this.accumulators, - &this.aggregate_expressions, - ); - - timer.done(); - - match result { - Ok(_) => continue, - Err(e) => 
Err(ArrowError::ExternalError(Box::new(e))), - } - } - Some(Err(e)) => Err(e), - None => { - this.finished = true; - let timer = this.baseline_metrics.elapsed_compute().timer(); - let result = create_batch_from_map( - &this.mode, - &this.accumulators, - this.group_expr.len(), - &this.schema, - ) - .record_output(&this.baseline_metrics); - - timer.done(); - result - } - }; - - this.finished = true; - return Poll::Ready(Some(result)); - } - } -} - -impl RecordBatchStream for GroupedHashAggregateStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -/// TODO: Make this a member function of [`GroupedHashAggregateStream`] -fn group_aggregate_batch( - mode: &AggregateMode, - random_state: &RandomState, - group_expr: &[Arc], - aggr_expr: &[Arc], - batch: RecordBatch, - accumulators: &mut Accumulators, - aggregate_expressions: &[Vec>], -) -> Result<()> { - // evaluate the grouping expressions - let group_values = evaluate(group_expr, &batch)?; - - // evaluate the aggregation expressions. - // We could evaluate them after the `take`, but since we need to evaluate all - // of them anyways, it is more performant to do it while they are together. 
- let aggr_input_values = evaluate_many(aggregate_expressions, &batch)?; - - // 1.1 construct the key from the group values - // 1.2 construct the mapping key if it does not exist - // 1.3 add the row' index to `indices` - - // track which entries in `accumulators` have rows in this batch to aggregate - let mut groups_with_rows = vec![]; - - // 1.1 Calculate the group keys for the group values - let mut batch_hashes = vec![0; batch.num_rows()]; - create_hashes(&group_values, random_state, &mut batch_hashes)?; - - for (row, hash) in batch_hashes.into_iter().enumerate() { - let Accumulators { map, group_states } = accumulators; - - let entry = map.get_mut(hash, |(_hash, group_idx)| { - // verify that a group that we are inserting with hash is - // actually the same key value as the group in - // existing_idx (aka group_values @ row) - let group_state = &group_states[*group_idx]; - group_values - .iter() - .zip(group_state.group_by_values.iter()) - .all(|(array, scalar)| scalar.eq_array(array, row)) - }); - - match entry { - // Existing entry for this group value - Some((_hash, group_idx)) => { - let group_state = &mut group_states[*group_idx]; - // 1.3 - if group_state.indices.is_empty() { - groups_with_rows.push(*group_idx); - }; - group_state.indices.push(row as u32); // remember this row - } - // 1.2 Need to create new entry - None => { - let accumulator_set = create_accumulators(aggr_expr)?; - - // Copy group values out of arrays into `ScalarValue`s - let group_by_values = group_values - .iter() - .map(|col| ScalarValue::try_from_array(col, row)) - .collect::>>()?; - - // Add new entry to group_states and save newly created index - let group_state = GroupState { - group_by_values: group_by_values.into_boxed_slice(), - accumulator_set, - indices: vec![row as u32], // 1.3 - }; - let group_idx = group_states.len(); - group_states.push(group_state); - groups_with_rows.push(group_idx); - - // for hasher function, use precomputed hash value - map.insert(hash, (hash, 
group_idx), |(hash, _group_idx)| *hash); - } - }; - } - - // Collect all indices + offsets based on keys in this vec - let mut batch_indices: UInt32Builder = UInt32Builder::new(0); - let mut offsets = vec![0]; - let mut offset_so_far = 0; - for group_idx in groups_with_rows.iter() { - let indices = &accumulators.group_states[*group_idx].indices; - batch_indices.append_slice(indices)?; - offset_so_far += indices.len(); - offsets.push(offset_so_far); - } - let batch_indices = batch_indices.finish(); - - // `Take` all values based on indices into Arrays - let values: Vec>> = aggr_input_values - .iter() - .map(|array| { - array - .iter() - .map(|array| { - compute::take( - array.as_ref(), - &batch_indices, - None, // None: no index check - ) - .unwrap() - }) - .collect() - // 2.3 - }) - .collect(); - - // 2.1 for each key in this batch - // 2.2 for each aggregation - // 2.3 `slice` from each of its arrays the keys' values - // 2.4 update / merge the accumulator with the values - // 2.5 clear indices - groups_with_rows - .iter() - .zip(offsets.windows(2)) - .try_for_each(|(group_idx, offsets)| { - let group_state = &mut accumulators.group_states[*group_idx]; - // 2.2 - group_state - .accumulator_set - .iter_mut() - .zip(values.iter()) - .map(|(accumulator, aggr_array)| { - ( - accumulator, - aggr_array - .iter() - .map(|array| { - // 2.3 - array.slice(offsets[0], offsets[1] - offsets[0]) - }) - .collect::>(), - ) - }) - .try_for_each(|(accumulator, values)| match mode { - AggregateMode::Partial => accumulator.update_batch(&values), - AggregateMode::FinalPartitioned | AggregateMode::Final => { - // note: the aggregation here is over states, not values, thus the merge - accumulator.merge_batch(&values) - } - }) - // 2.5 - .and({ - group_state.indices.clear(); - Ok(()) - }) - })?; - - Ok(()) -} - -type AccumulatorItem = Box; - -/// The state that is built for each output group. 
-#[derive(Debug)] -struct GroupState { - /// The actual group by values, one for each group column - group_by_values: Box<[ScalarValue]>, - - // Accumulator state, one for each aggregate - accumulator_set: Vec, - - /// scratch space used to collect indices for input rows in a - /// bach that have values to aggregate. Reset on each batch - indices: Vec, -} - -/// The state of all the groups -#[derive(Default)] -struct Accumulators { - /// Logically maps group values to an index in `group_states` - /// - /// Uses the raw API of hashbrown to avoid actually storing the - /// keys in the table - /// - /// keys: u64 hashes of the GroupValue - /// values: (hash, index into `group_states`) - map: RawTable<(u64, usize)>, - - /// State for each group - group_states: Vec, -} - -impl std::fmt::Debug for Accumulators { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - // hashes are not store inline, so could only get values - let map_string = "RawTable"; - f.debug_struct("Accumulators") - .field("map", &map_string) - .field("group_states", &self.group_states) - .finish() - } -} - -/// Evaluates expressions against a record batch. -fn evaluate( - expr: &[Arc], - batch: &RecordBatch, -) -> Result> { - expr.iter() - .map(|expr| expr.evaluate(batch)) - .map(|r| r.map(|v| v.into_array(batch.num_rows()))) - .collect::>>() -} - -/// Evaluates expressions against a record batch. -fn evaluate_many( - expr: &[Vec>], - batch: &RecordBatch, -) -> Result>> { - expr.iter() - .map(|expr| evaluate(expr, batch)) - .collect::>>() -} - -/// uses `state_fields` to build a vec of physical column expressions required to merge the -/// AggregateExpr' accumulator's state. -/// -/// `index_base` is the starting physical column index for the next expanded state field. -fn merge_expressions( - index_base: usize, - expr: &Arc, -) -> Result>> { - Ok(expr - .state_fields()? 
- .iter() - .enumerate() - .map(|(idx, f)| { - Arc::new(Column::new(f.name(), index_base + idx)) as Arc - }) - .collect::>()) -} - -/// returns physical expressions to evaluate against a batch -/// The expressions are different depending on `mode`: -/// * Partial: AggregateExpr::expressions -/// * Final: columns of `AggregateExpr::state_fields()` -fn aggregate_expressions( - aggr_expr: &[Arc], - mode: &AggregateMode, - col_idx_base: usize, -) -> Result>>> { - match mode { - AggregateMode::Partial => { - Ok(aggr_expr.iter().map(|agg| agg.expressions()).collect()) - } - // in this mode, we build the merge expressions of the aggregation - AggregateMode::Final | AggregateMode::FinalPartitioned => { - let mut col_idx_base = col_idx_base; - Ok(aggr_expr - .iter() - .map(|agg| { - let exprs = merge_expressions(col_idx_base, agg)?; - col_idx_base += exprs.len(); - Ok(exprs) - }) - .collect::>>()?) - } - } -} - -/// stream struct for hash aggregation -pub struct HashAggregateStream { - schema: SchemaRef, - mode: AggregateMode, - input: SendableRecordBatchStream, - baseline_metrics: BaselineMetrics, - aggregate_expressions: Vec>>, - accumulators: Vec, - finished: bool, -} - -impl HashAggregateStream { - /// Create a new HashAggregateStream - pub fn new( - mode: AggregateMode, - schema: SchemaRef, - aggr_expr: Vec>, - input: SendableRecordBatchStream, - baseline_metrics: BaselineMetrics, - ) -> Result { - let aggregate_expressions = aggregate_expressions(&aggr_expr, &mode, 0)?; - let accumulators = create_accumulators(&aggr_expr)?; - - Ok(Self { - schema, - mode, - input, - baseline_metrics, - aggregate_expressions, - accumulators, - finished: false, - }) - } -} - -/// TODO: Make this a member function -fn aggregate_batch( - mode: &AggregateMode, - batch: &RecordBatch, - accumulators: &mut [AccumulatorItem], - expressions: &[Vec>], -) -> Result<()> { - // 1.1 iterate accumulators and respective expressions together - // 1.2 evaluate expressions - // 1.3 update / merge 
accumulators with the expressions' values - - // 1.1 - accumulators - .iter_mut() - .zip(expressions) - .try_for_each(|(accum, expr)| { - // 1.2 - let values = &expr - .iter() - .map(|e| e.evaluate(batch)) - .map(|r| r.map(|v| v.into_array(batch.num_rows()))) - .collect::>>()?; - - // 1.3 - match mode { - AggregateMode::Partial => accum.update_batch(values), - AggregateMode::Final | AggregateMode::FinalPartitioned => { - accum.merge_batch(values) - } - } - }) -} - -impl Stream for HashAggregateStream { - type Item = ArrowResult; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - let this = &mut *self; - if this.finished { - return Poll::Ready(None); - } - - let elapsed_compute = this.baseline_metrics.elapsed_compute(); - - loop { - let result = match ready!(this.input.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - let timer = elapsed_compute.timer(); - let result = aggregate_batch( - &this.mode, - &batch, - &mut this.accumulators, - &this.aggregate_expressions, - ); - - timer.done(); - - match result { - Ok(_) => continue, - Err(e) => Err(ArrowError::ExternalError(Box::new(e))), - } - } - Some(Err(e)) => Err(e), - None => { - this.finished = true; - let timer = this.baseline_metrics.elapsed_compute().timer(); - let result = finalize_aggregation(&this.accumulators, &this.mode) - .map_err(|e| ArrowError::ExternalError(Box::new(e))) - .and_then(|columns| { - RecordBatch::try_new(this.schema.clone(), columns) - }) - .record_output(&this.baseline_metrics); - - timer.done(); - result - } - }; - - this.finished = true; - return Poll::Ready(Some(result)); - } - } -} - -impl RecordBatchStream for HashAggregateStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -/// Create a RecordBatch with all group keys and accumulator' states or values. 
-fn create_batch_from_map( - mode: &AggregateMode, - accumulators: &Accumulators, - num_group_expr: usize, - output_schema: &Schema, -) -> ArrowResult { - if accumulators.group_states.is_empty() { - return Ok(RecordBatch::new_empty(Arc::new(output_schema.to_owned()))); - } - let accs = &accumulators.group_states[0].accumulator_set; - let mut acc_data_types: Vec = vec![]; - - // Calculate number/shape of state arrays - match mode { - AggregateMode::Partial => { - for acc in accs.iter() { - let state = acc.state()?; - acc_data_types.push(state.len()); - } - } - AggregateMode::Final | AggregateMode::FinalPartitioned => { - acc_data_types = vec![1; accs.len()]; - } - } - - let mut columns = (0..num_group_expr) - .map(|i| { - ScalarValue::iter_to_array( - accumulators - .group_states - .iter() - .map(|group_state| group_state.group_by_values[i].clone()), - ) - }) - .collect::>>()?; - - // add state / evaluated arrays - for (x, &state_len) in acc_data_types.iter().enumerate() { - for y in 0..state_len { - match mode { - AggregateMode::Partial => { - let res = ScalarValue::iter_to_array( - accumulators.group_states.iter().map(|group_state| { - let x = group_state.accumulator_set[x].state().unwrap(); - x[y].clone() - }), - )?; - - columns.push(res); - } - AggregateMode::Final | AggregateMode::FinalPartitioned => { - let res = ScalarValue::iter_to_array( - accumulators.group_states.iter().map(|group_state| { - group_state.accumulator_set[x].evaluate().unwrap() - }), - )?; - columns.push(res); - } - } - } - } - - // cast output if needed (e.g. 
for types like Dictionary where - // the intermediate GroupByScalar type was not the same as the - // output - let columns = columns - .iter() - .zip(output_schema.fields().iter()) - .map(|(col, desired_field)| cast(col, desired_field.data_type())) - .collect::>>()?; - - RecordBatch::try_new(Arc::new(output_schema.to_owned()), columns) -} - -fn create_accumulators( - aggr_expr: &[Arc], -) -> Result> { - aggr_expr - .iter() - .map(|expr| expr.create_accumulator()) - .collect::>>() -} - -/// returns a vector of ArrayRefs, where each entry corresponds to either the -/// final value (mode = Final) or states (mode = Partial) -fn finalize_aggregation( - accumulators: &[AccumulatorItem], - mode: &AggregateMode, -) -> Result> { - match mode { - AggregateMode::Partial => { - // build the vector of states - let a = accumulators - .iter() - .map(|accumulator| accumulator.state()) - .map(|value| { - value.map(|e| { - e.iter().map(|v| v.to_array()).collect::>() - }) - }) - .collect::>>()?; - Ok(a.iter().flatten().cloned().collect::>()) - } - AggregateMode::Final | AggregateMode::FinalPartitioned => { - // merge the state to the final value - accumulators - .iter() - .map(|accumulator| accumulator.evaluate().map(|v| v.to_array())) - .collect::>>() - } - } -} - -#[cfg(test)] -mod tests { - - use super::*; - use crate::from_slice::FromSlice; - use crate::physical_plan::expressions::{col, Avg}; - use crate::test::assert_is_pending; - use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; - use crate::{assert_batches_sorted_eq, physical_plan::common}; - use arrow::array::{Float64Array, UInt32Array}; - use arrow::datatypes::DataType; - use datafusion_common::DataFusionError; - use futures::FutureExt; - - use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; - use crate::prelude::SessionContext; - - /// some mock data to aggregates - fn some_data() -> (Arc, Vec) { - // define a schema. 
- let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::UInt32, false), - Field::new("b", DataType::Float64, false), - ])); - - // define data. - ( - schema.clone(), - vec![ - RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(UInt32Array::from_slice(&[2, 3, 4, 4])), - Arc::new(Float64Array::from_slice(&[1.0, 2.0, 3.0, 4.0])), - ], - ) - .unwrap(), - RecordBatch::try_new( - schema, - vec![ - Arc::new(UInt32Array::from_slice(&[2, 3, 3, 4])), - Arc::new(Float64Array::from_slice(&[1.0, 2.0, 3.0, 4.0])), - ], - ) - .unwrap(), - ], - ) - } - - /// build the aggregates on the data from some_data() and check the results - async fn check_aggregates(input: Arc) -> Result<()> { - let input_schema = input.schema(); - - let groups: Vec<(Arc, String)> = - vec![(col("a", &input_schema)?, "a".to_string())]; - - let aggregates: Vec> = vec![Arc::new(Avg::new( - col("b", &input_schema)?, - "AVG(b)".to_string(), - DataType::Float64, - ))]; - - let session_ctx = SessionContext::new(); - let task_ctx = session_ctx.task_ctx(); - - let partial_aggregate = Arc::new(HashAggregateExec::try_new( - AggregateMode::Partial, - groups.clone(), - aggregates.clone(), - input, - input_schema.clone(), - )?); - - let result = - common::collect(partial_aggregate.execute(0, task_ctx.clone()).await?) 
- .await?; - - let expected = vec![ - "+---+---------------+-------------+", - "| a | AVG(b)[count] | AVG(b)[sum] |", - "+---+---------------+-------------+", - "| 2 | 2 | 2 |", - "| 3 | 3 | 7 |", - "| 4 | 3 | 11 |", - "+---+---------------+-------------+", - ]; - assert_batches_sorted_eq!(expected, &result); - - let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); - - let final_group: Vec> = (0..groups.len()) - .map(|i| col(&groups[i].1, &input_schema)) - .collect::>()?; - - let merged_aggregate = Arc::new(HashAggregateExec::try_new( - AggregateMode::Final, - final_group - .iter() - .enumerate() - .map(|(i, expr)| (expr.clone(), groups[i].1.clone())) - .collect(), - aggregates, - merge, - input_schema, - )?); - - let result = - common::collect(merged_aggregate.execute(0, task_ctx.clone()).await?).await?; - assert_eq!(result.len(), 1); - - let batch = &result[0]; - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 3); - - let expected = vec![ - "+---+--------------------+", - "| a | AVG(b) |", - "+---+--------------------+", - "| 2 | 1 |", - "| 3 | 2.3333333333333335 |", // 3, (2 + 3 + 2) / 3 - "| 4 | 3.6666666666666665 |", // 4, (3 + 4 + 4) / 3 - "+---+--------------------+", - ]; - - assert_batches_sorted_eq!(&expected, &result); - - let metrics = merged_aggregate.metrics().unwrap(); - let output_rows = metrics.output_rows().unwrap(); - assert_eq!(3, output_rows); - - Ok(()) - } - - /// Define a test source that can yield back to runtime before returning its first item /// - - #[derive(Debug)] - struct TestYieldingExec { - /// True if this exec should yield back to runtime the first time it is polled - pub yield_first: bool, - } - - #[async_trait] - impl ExecutionPlan for TestYieldingExec { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { - some_data().0 - } - - fn children(&self) -> Vec> { - vec![] - } - - fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(1) - } - - 
fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - None - } - - fn with_new_children( - self: Arc, - _: Vec>, - ) -> Result> { - Err(DataFusionError::Internal(format!( - "Children cannot be replaced in {:?}", - self - ))) - } - - async fn execute( - &self, - _partition: usize, - _context: Arc, - ) -> Result { - let stream = if self.yield_first { - TestYieldingStream::New - } else { - TestYieldingStream::Yielded - }; - - Ok(Box::pin(stream)) - } - - fn statistics(&self) -> Statistics { - let (_, batches) = some_data(); - common::compute_record_batch_statistics(&[batches], &self.schema(), None) - } - } - - /// A stream using the demo data. If inited as new, it will first yield to runtime before returning records - enum TestYieldingStream { - New, - Yielded, - ReturnedBatch1, - ReturnedBatch2, - } - - impl Stream for TestYieldingStream { - type Item = ArrowResult; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - match &*self { - TestYieldingStream::New => { - *(self.as_mut()) = TestYieldingStream::Yielded; - cx.waker().wake_by_ref(); - Poll::Pending - } - TestYieldingStream::Yielded => { - *(self.as_mut()) = TestYieldingStream::ReturnedBatch1; - Poll::Ready(Some(Ok(some_data().1[0].clone()))) - } - TestYieldingStream::ReturnedBatch1 => { - *(self.as_mut()) = TestYieldingStream::ReturnedBatch2; - Poll::Ready(Some(Ok(some_data().1[1].clone()))) - } - TestYieldingStream::ReturnedBatch2 => Poll::Ready(None), - } - } - } - - impl RecordBatchStream for TestYieldingStream { - fn schema(&self) -> SchemaRef { - some_data().0 - } - } - - //// Tests //// - - #[tokio::test] - async fn aggregate_source_not_yielding() -> Result<()> { - let input: Arc = - Arc::new(TestYieldingExec { yield_first: false }); - - check_aggregates(input).await - } - - #[tokio::test] - async fn aggregate_source_with_yielding() -> Result<()> { - let input: Arc = - Arc::new(TestYieldingExec { yield_first: true }); - - check_aggregates(input).await - 
} - - #[tokio::test] - async fn test_drop_cancel_without_groups() -> Result<()> { - let session_ctx = SessionContext::new(); - let task_ctx = session_ctx.task_ctx(); - let schema = - Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, true)])); - - let groups = vec![]; - - let aggregates: Vec> = vec![Arc::new(Avg::new( - col("a", &schema)?, - "AVG(a)".to_string(), - DataType::Float64, - ))]; - - let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); - let refs = blocking_exec.refs(); - let hash_aggregate_exec = Arc::new(HashAggregateExec::try_new( - AggregateMode::Partial, - groups.clone(), - aggregates.clone(), - blocking_exec, - schema, - )?); - - let fut = crate::physical_plan::collect(hash_aggregate_exec, task_ctx); - let mut fut = fut.boxed(); - - assert_is_pending(&mut fut); - drop(fut); - assert_strong_count_converges_to_zero(refs).await; - - Ok(()) - } - - #[tokio::test] - async fn test_drop_cancel_with_groups() -> Result<()> { - let session_ctx = SessionContext::new(); - let task_ctx = session_ctx.task_ctx(); - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Float32, true), - Field::new("b", DataType::Float32, true), - ])); - - let groups: Vec<(Arc, String)> = - vec![(col("a", &schema)?, "a".to_string())]; - - let aggregates: Vec> = vec![Arc::new(Avg::new( - col("b", &schema)?, - "AVG(b)".to_string(), - DataType::Float64, - ))]; - - let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); - let refs = blocking_exec.refs(); - let hash_aggregate_exec = Arc::new(HashAggregateExec::try_new( - AggregateMode::Partial, - groups.clone(), - aggregates.clone(), - blocking_exec, - schema, - )?); - - let fut = crate::physical_plan::collect(hash_aggregate_exec, task_ctx); - let mut fut = fut.boxed(); - - assert_is_pending(&mut fut); - drop(fut); - assert_strong_count_converges_to_zero(refs).await; - - Ok(()) - } -} diff --git a/datafusion/core/src/physical_plan/mod.rs 
b/datafusion/core/src/physical_plan/mod.rs index b7b25a636efc9..dc963c7e1bdc1 100644 --- a/datafusion/core/src/physical_plan/mod.rs +++ b/datafusion/core/src/physical_plan/mod.rs @@ -555,7 +555,6 @@ pub use datafusion_physical_expr::expressions; pub mod file_format; pub mod filter; pub mod functions; -pub mod hash_aggregate; pub mod hash_join; pub mod hash_utils; pub mod join_utils; diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 84785777b016c..966b973b37cde 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -34,6 +34,7 @@ use crate::logical_plan::{ }; use crate::logical_plan::{Limit, Values}; use crate::physical_optimizer::optimizer::PhysicalOptimizerRule; +use crate::physical_plan::aggregates::{AggregateExec, AggregateMode}; use crate::physical_plan::cross_join::CrossJoinExec; use crate::physical_plan::explain::ExplainExec; use crate::physical_plan::expressions; @@ -41,7 +42,6 @@ use crate::physical_plan::expressions::{ CaseExpr, Column, GetIndexedFieldExpr, Literal, PhysicalSortExpr, }; use crate::physical_plan::filter::FilterExec; -use crate::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use crate::physical_plan::hash_join::HashJoinExec; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::projection::ProjectionExec; @@ -524,7 +524,7 @@ impl DefaultPhysicalPlanner { }) .collect::>>()?; - let initial_aggr = Arc::new(HashAggregateExec::try_new( + let initial_aggr = Arc::new(AggregateExec::try_new( AggregateMode::Partial, groups.clone(), aggregates.clone(), @@ -566,7 +566,7 @@ impl DefaultPhysicalPlanner { (initial_aggr, AggregateMode::Final) }; - Ok(Arc::new(HashAggregateExec::try_new( + Ok(Arc::new(AggregateExec::try_new( next_partition_mode, final_group .iter() @@ -1839,7 +1839,7 @@ mod tests { let execution_plan = plan(&logical_plan).await?; let final_hash_agg = execution_plan 
.as_any() - .downcast_ref::() + .downcast_ref::() .expect("hash aggregate"); assert_eq!( "SUM(aggregate_test_100.c2)", diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index a124311aa4ff5..77287f566a6aa 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -110,7 +110,7 @@ async fn explain_analyze_baseline_metrics() { use datafusion::physical_plan::sorts; plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() // CoalescePartitionsExec doesn't do any work so is not included || plan.as_any().downcast_ref::().is_some() || plan.as_any().downcast_ref::().is_some() From fbeaf0b0ded82a72d3c3755eb1366af98f120556 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 29 Apr 2022 12:06:44 +0800 Subject: [PATCH 2/8] basic accumulators --- datafusion/core/Cargo.toml | 2 +- .../core/src/physical_plan/aggregates/hash.rs | 25 +- .../core/src/physical_plan/aggregates/mod.rs | 23 + .../src/physical_plan/aggregates/row_hash.rs | 412 ++++++++++++++++++ .../core/src/physical_plan/hash_utils.rs | 35 ++ datafusion/physical-expr/Cargo.toml | 4 +- .../physical-expr/src/aggregate/average.rs | 71 +++ .../physical-expr/src/aggregate/count.rs | 44 ++ .../physical-expr/src/aggregate/min_max.rs | 162 +++++++ datafusion/physical-expr/src/aggregate/mod.rs | 6 + .../src/aggregate/row_accumulator.rs | 39 ++ datafusion/physical-expr/src/aggregate/sum.rs | 141 +++++- datafusion/row/src/accessor.rs | 302 +++++++++++++ datafusion/row/src/layout.rs | 8 +- datafusion/row/src/lib.rs | 1 + datafusion/row/src/reader.rs | 3 + datafusion/row/src/writer.rs | 2 + 17 files changed, 1252 insertions(+), 28 deletions(-) create mode 100644 datafusion/core/src/physical_plan/aggregates/row_hash.rs create mode 100644 datafusion/physical-expr/src/aggregate/row_accumulator.rs create mode 100644 datafusion/row/src/accessor.rs diff 
--git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 533b38b81c7d7..37361cbb507f2 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -41,7 +41,7 @@ path = "src/lib.rs" # Used to enable the avro format avro = ["avro-rs", "num-traits", "datafusion-common/avro"] crypto_expressions = ["datafusion-physical-expr/crypto_expressions"] -default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] +default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "row"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) force_hash_collisions = [] # Used to enable JIT code generation diff --git a/datafusion/core/src/physical_plan/aggregates/hash.rs b/datafusion/core/src/physical_plan/aggregates/hash.rs index 85e82f14c55d5..c0fcd5413f72e 100644 --- a/datafusion/core/src/physical_plan/aggregates/hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/hash.rs @@ -28,7 +28,9 @@ use futures::{ }; use crate::error::Result; -use crate::physical_plan::aggregates::{AccumulatorItem, AggregateMode}; +use crate::physical_plan::aggregates::{ + evaluate, evaluate_many, AccumulatorItem, AggregateMode, +}; use crate::physical_plan::hash_utils::create_hashes; use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; @@ -380,27 +382,6 @@ impl std::fmt::Debug for Accumulators { } } -/// Evaluates expressions against a record batch. -fn evaluate( - expr: &[Arc], - batch: &RecordBatch, -) -> Result> { - expr.iter() - .map(|expr| expr.evaluate(batch)) - .map(|r| r.map(|v| v.into_array(batch.num_rows()))) - .collect::>>() -} - -/// Evaluates expressions against a record batch. -fn evaluate_many( - expr: &[Vec>], - batch: &RecordBatch, -) -> Result>> { - expr.iter() - .map(|expr| evaluate(expr, batch)) - .collect::>>() -} - /// Create a RecordBatch with all group keys and accumulator' states or values. 
fn create_batch_from_map( mode: &AggregateMode, diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index af7df3dccfc71..d2cb486dba3ca 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -29,6 +29,7 @@ use crate::physical_plan::{ }; use arrow::array::ArrayRef; use arrow::datatypes::{Field, Schema, SchemaRef}; +use arrow::record_batch::RecordBatch; use async_trait::async_trait; use datafusion_common::Result; use datafusion_expr::Accumulator; @@ -41,6 +42,7 @@ use std::sync::Arc; mod hash; mod no_grouping; +mod row_hash; pub use datafusion_expr::AggregateFunction; pub use datafusion_physical_expr::expressions::create_aggregate_expr; @@ -404,6 +406,27 @@ fn finalize_aggregation( } } +/// Evaluates expressions against a record batch. +fn evaluate( + expr: &[Arc], + batch: &RecordBatch, +) -> Result> { + expr.iter() + .map(|expr| expr.evaluate(batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>() +} + +/// Evaluates expressions against a record batch. +fn evaluate_many( + expr: &[Vec>], + batch: &RecordBatch, +) -> Result>> { + expr.iter() + .map(|expr| evaluate(expr, batch)) + .collect::>>() +} + #[cfg(test)] mod tests { use crate::execution::context::TaskContext; diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs new file mode 100644 index 0000000000000..251adb23790b4 --- /dev/null +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -0,0 +1,412 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Hash aggregation through row format + +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::vec; + +use ahash::RandomState; +use futures::{ + ready, + stream::{Stream, StreamExt}, +}; + +use crate::error::Result; +use crate::physical_plan::aggregates::{evaluate, evaluate_many, AggregateMode}; +use crate::physical_plan::hash_utils::create_row_hashes; +use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; +use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; +use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; + +use arrow::datatypes::Schema; +use arrow::{array::ArrayRef, compute}; +use arrow::{ + array::{Array, UInt32Builder}, + error::{ArrowError, Result as ArrowResult}, +}; +use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; +use datafusion_row::layout::RowLayout; +use datafusion_row::writer::{write_row, RowWriter}; +use datafusion_row::RowType; +use hashbrown::raw::RawTable; + +/* +The architecture is the following: + +1. An accumulator has state that is updated on each batch. +2. At the end of the aggregation (e.g. end of batches in a partition), the accumulator converts its state to a RecordBatch of a single row +3. The RecordBatches of all accumulators are merged (`concatenate` in `rust/arrow`) together to a single RecordBatch. +4. The state's RecordBatch is `merge`d to a new state +5. 
The state is mapped to the final value + +Why: + +* Accumulators' state can be statically typed, but it is more efficient to transmit data from the accumulators via `Array` +* The `merge` operation must have access to the state of the aggregators because it uses it to correctly merge +* It uses Arrow's native dynamically typed object, `Array`. +* Arrow shines in batch operations and both `merge` and `concatenate` of uniform types are very performant. + +Example: average + +* the state is `n: u32` and `sum: f64` +* For every batch, we update them accordingly. +* At the end of the accumulation (of a partition), we convert `n` and `sum` to a RecordBatch of 1 row and two columns: `[n, sum]` +* The RecordBatch is (sent back / transmitted over network) +* Once all N record batches arrive, `merge` is performed, which builds a RecordBatch with N rows and 2 columns. +* Finally, `get_value` returns an array with one entry computed from the state +*/ +pub(crate) struct GroupedRowHashAggregateStream { + schema: SchemaRef, + input: SendableRecordBatchStream, + mode: AggregateMode, + accumulators: Accumulators, + aggregate_expressions: Vec>>, + + aggr_expr: Vec>, + group_expr: Vec>, + + group_schema: SchemaRef, + aggr_schema: SchemaRef, + aggr_layout: RowLayout, + aggr_buffer_width: usize, + + baseline_metrics: BaselineMetrics, + random_state: RandomState, + finished: bool, +} + +fn create_separate_schema(schema: &Schema, group_count: usize) -> (SchemaRef, SchemaRef) { + let (group_fields, aggr_fields) = schema.fields().split_at(group_count); + ( + Arc::new(Schema::new(group_fields.to_vec())), + Arc::new(Schema::new(aggr_fields.to_vec())), + ) +} + +impl GroupedRowHashAggregateStream { + /// Create a new GroupedRowHashAggregateStream + pub fn new( + mode: AggregateMode, + schema: SchemaRef, + group_expr: Vec>, + aggr_expr: Vec>, + input: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + ) -> Result { + let timer = baseline_metrics.elapsed_compute().timer(); + + 
// The expressions to evaluate the batch, one vec of expressions per aggregation. + // Assume create_schema() always put group columns in front of aggr columns, we set + // col_idx_base to group expression count. + let aggregate_expressions = + aggregates::aggregate_expressions(&aggr_expr, &mode, group_expr.len())?; + + let (group_schema, aggr_schema) = + create_separate_schema(&schema, group_expr.len()); + let aggr_layout = RowLayout::new(&aggr_schema, RowType::WordAligned); + let aggr_buffer_width = aggr_layout.fixed_part_width(); + timer.done(); + + Ok(Self { + schema, + mode, + input, + aggr_expr, + group_expr, + group_schema, + aggr_schema, + aggr_layout, + aggr_buffer_width, + baseline_metrics, + aggregate_expressions, + accumulators: Default::default(), + random_state: Default::default(), + finished: false, + }) + } +} + +impl Stream for GroupedRowHashAggregateStream { + type Item = ArrowResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let this = &mut *self; + if this.finished { + return Poll::Ready(None); + } + + let elapsed_compute = this.baseline_metrics.elapsed_compute(); + + loop { + let result = match ready!(this.input.poll_next_unpin(cx)) { + Some(Ok(batch)) => { + let timer = elapsed_compute.timer(); + let result = group_aggregate_batch( + &this.mode, + &this.random_state, + &this.group_expr, + &this.aggr_expr, + &this.group_schema, + &this.aggr_schema, + &this.aggr_layout, + this.aggr_buffer_width, + batch, + &mut this.accumulators, + &this.aggregate_expressions, + ); + + timer.done(); + + match result { + Ok(_) => continue, + Err(e) => Err(ArrowError::ExternalError(Box::new(e))), + } + } + Some(Err(e)) => Err(e), + None => { + this.finished = true; + let timer = this.baseline_metrics.elapsed_compute().timer(); + let result = create_batch_from_map( + &this.mode, + &this.accumulators, + this.group_expr.len(), + &this.schema, + ) + .record_output(&this.baseline_metrics); + + timer.done(); + 
result + } + }; + + this.finished = true; + return Poll::Ready(Some(result)); + } + } +} + +impl RecordBatchStream for GroupedRowHashAggregateStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +/// TODO: Make this a member function of [`GroupedRowHashAggregateStream`] +fn group_aggregate_batch( + mode: &AggregateMode, + random_state: &RandomState, + group_expr: &[Arc], + aggr_expr: &[Arc], + group_schema: &Schema, + aggr_schema: &Schema, + aggr_row_layout: &RowLayout, + aggr_buffer_width: usize, + batch: RecordBatch, + accumulators: &mut Accumulators, + aggregate_expressions: &[Vec>], +) -> Result<()> { + // evaluate the grouping expressions + let group_values = evaluate(group_expr, &batch)?; + let group_rows: Vec> = create_group_rows(group_values, group_schema); + + // evaluate the aggregation expressions. + // We could evaluate them after the `take`, but since we need to evaluate all + // of them anyways, it is more performant to do it while they are together. + let aggr_input_values = evaluate_many(aggregate_expressions, &batch)?; + + // 1.1 construct the key from the group values + // 1.2 construct the mapping key if it does not exist + // 1.3 add the row' index to `indices` + + // track which entries in `accumulators` have rows in this batch to aggregate + let mut groups_with_rows = vec![]; + + // 1.1 Calculate the group keys for the group values + let mut batch_hashes = vec![0; batch.num_rows()]; + create_row_hashes(&group_rows, random_state, &mut batch_hashes)?; + + for (row, hash) in batch_hashes.into_iter().enumerate() { + let Accumulators { map, group_states } = accumulators; + + let entry = map.get_mut(hash, |(_hash, group_idx)| { + // verify that a group that we are inserting with hash is + // actually the same key value as the group in + // existing_idx (aka group_values @ row) + let group_state = &group_states[*group_idx]; + group_rows[row] == group_state.group_by_values + }); + + match entry { + // Existing entry for this group 
value + Some((_hash, group_idx)) => { + let group_state = &mut group_states[*group_idx]; + // 1.3 + if group_state.indices.is_empty() { + groups_with_rows.push(*group_idx); + }; + group_state.indices.push(row as u32); // remember this row + } + // 1.2 Need to create new entry + None => { + // Add new entry to group_states and save newly created index + let group_state = RowGroupState { + group_by_values: group_rows[row].clone(), + aggregation_buffer: Vec::with_capacity(aggr_buffer_width), + indices: vec![row as u32], // 1.3 + }; + let group_idx = group_states.len(); + group_states.push(group_state); + groups_with_rows.push(group_idx); + + // for hasher function, use precomputed hash value + map.insert(hash, (hash, group_idx), |(hash, _group_idx)| *hash); + } + }; + } + + // Collect all indices + offsets based on keys in this vec + let mut batch_indices: UInt32Builder = UInt32Builder::new(0); + let mut offsets = vec![0]; + let mut offset_so_far = 0; + for group_idx in groups_with_rows.iter() { + let indices = &accumulators.group_states[*group_idx].indices; + batch_indices.append_slice(indices)?; + offset_so_far += indices.len(); + offsets.push(offset_so_far); + } + let batch_indices = batch_indices.finish(); + + // `Take` all values based on indices into Arrays + let values: Vec>> = aggr_input_values + .iter() + .map(|array| { + array + .iter() + .map(|array| { + compute::take( + array.as_ref(), + &batch_indices, + None, // None: no index check + ) + .unwrap() + }) + .collect() + // 2.3 + }) + .collect(); + + // 2.1 for each key in this batch + // 2.2 for each aggregation + // 2.3 `slice` from each of its arrays the keys' values + // 2.4 update / merge the accumulator with the values + // 2.5 clear indices + groups_with_rows + .iter() + .zip(offsets.windows(2)) + .try_for_each(|(group_idx, offsets)| { + let group_state = &mut accumulators.group_states[*group_idx]; + // 2.2 + group_state + .accumulator_set + .iter_mut() + .zip(values.iter()) + .map(|(accumulator, 
aggr_array)| { + ( + accumulator, + aggr_array + .iter() + .map(|array| { + // 2.3 + array.slice(offsets[0], offsets[1] - offsets[0]) + }) + .collect::>(), + ) + }) + .try_for_each(|(accumulator, values)| match mode { + AggregateMode::Partial => accumulator.update_batch(&values), + AggregateMode::FinalPartitioned | AggregateMode::Final => { + // note: the aggregation here is over states, not values, thus the merge + accumulator.merge_batch(&values) + } + }) + // 2.5 + .and({ + group_state.indices.clear(); + Ok(()) + }) + })?; + + Ok(()) +} + +/// The state that is built for each output group. +#[derive(Debug)] +struct RowGroupState { + /// The actual group by values, stored sequentially + group_by_values: Vec, + + // Accumulator state, stored sequentially + aggregation_buffer: Vec, + + /// scratch space used to collect indices for input rows in a + /// bach that have values to aggregate. Reset on each batch + indices: Vec, +} + +/// The state of all the groups +#[derive(Default)] +struct Accumulators { + /// Logically maps group values to an index in `group_states` + /// + /// Uses the raw API of hashbrown to avoid actually storing the + /// keys in the table + /// + /// keys: u64 hashes of the GroupValue + /// values: (hash, index into `group_states`) + map: RawTable<(u64, usize)>, + + /// State for each group + group_states: Vec, +} + +impl std::fmt::Debug for Accumulators { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + // hashes are not store inline, so could only get values + let map_string = "RawTable"; + f.debug_struct("RowAccumulators") + .field("map", &map_string) + .field("row_group_states", &self.group_states) + .finish() + } +} + +/// Create grouping rows +fn create_group_rows(arrays: Vec, schema: &Schema) -> Vec> { + let mut writer = RowWriter::new(schema, RowType::Compact); + let mut results = vec![]; + for cur_row in 0..arrays[0].len() { + write_row(&mut writer, cur_row, schema, &arrays); + 
results.push(writer.get_row().to_vec()); + writer.reset() + } + results +} diff --git a/datafusion/core/src/physical_plan/hash_utils.rs b/datafusion/core/src/physical_plan/hash_utils.rs index 4e503b19e7bf3..3c0207a863cf7 100644 --- a/datafusion/core/src/physical_plan/hash_utils.rs +++ b/datafusion/core/src/physical_plan/hash_utils.rs @@ -268,6 +268,41 @@ pub fn create_hashes<'a>( return Ok(hashes_buffer); } +/// Test version of `create_row_hashes` that produces the same value for +/// all hashes (to test collisions) +/// +/// See comments on `hashes_buffer` for more details +#[cfg(feature = "force_hash_collisions")] +pub fn create_row_hashes<'a>( + _rows: &[Vec], + _random_state: &RandomState, + hashes_buffer: &'a mut Vec, +) -> Result<&'a mut Vec> { + for hash in hashes_buffer.iter_mut() { + *hash = 0 + } + return Ok(hashes_buffer); +} + +/// Test version of `create_row_hashes` that produces the same value for +/// all hashes (to test collisions) +/// +/// See comments on `hashes_buffer` for more details +#[cfg(not(feature = "force_hash_collisions"))] +pub fn create_row_hashes<'a>( + rows: &[Vec], + random_state: &RandomState, + hashes_buffer: &'a mut Vec, +) -> Result<&'a mut Vec> { + for hash in hashes_buffer.iter_mut() { + *hash = 0 + } + for (i, hash) in hashes_buffer.iter_mut().enumerate() { + *hash = >::get_hash(&rows[i], random_state); + } + return Ok(hashes_buffer); +} + /// Creates hash values for every row, based on the values in the /// columns. 
/// diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 2ae5fa3198408..3df9e984b3a40 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -34,8 +34,9 @@ path = "src/lib.rs" [features] crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] -default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] +default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "row"] regex_expressions = ["regex"] +row = ["datafusion-row"] unicode_expressions = ["unicode-segmentation"] [dependencies] @@ -46,6 +47,7 @@ blake3 = { version = "1.0", optional = true } chrono = { version = "0.4", default-features = false } datafusion-common = { path = "../common", version = "7.0.0" } datafusion-expr = { path = "../expr", version = "7.0.0" } +datafusion-row = { path = "../row", version = "7.0.0", optional = true } hashbrown = { version = "0.12", features = ["raw"] } lazy_static = { version = "^1.4.0" } md-5 = { version = "^0.10.0", optional = true } diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 637a7f99d35ca..a917af3993732 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -21,6 +21,7 @@ use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::aggregate::sum; use crate::expressions::format_state_name; use crate::{AggregateExpr, PhysicalExpr}; @@ -33,6 +34,7 @@ use arrow::{ use datafusion_common::ScalarValue; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::Accumulator; +use datafusion_row::accessor::RowAccessor; /// AVG aggregate expression #[derive(Debug)] @@ -101,6 +103,22 @@ impl AggregateExpr for Avg { fn name(&self) -> &str { &self.name } + + fn row_state_supported(&self) -> bool { + matches!( + self.data_type, + DataType::UInt8 + 
| DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + ) + } } /// An accumulator to compute the average @@ -167,6 +185,59 @@ impl Accumulator for AvgAccumulator { } } +#[derive(Debug)] +struct AvgRowAccumulator { + start_index: usize, + sum_datatype: DataType, +} + +impl AvgRowAccumulator { + pub fn new(start_index: usize, sum_datatype: DataType) -> Self { + Self { + start_index, + sum_datatype, + } + } +} + +impl RowAccumulator for AvgRowAccumulator { + fn update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let values = &values[0]; + + let delta = (values.len() - values.data().null_count()) as u64; + accessor.add_u64(self.start_index, delta); + sum::add_to_row( + &self.sum_datatype, + self.start_index + 1, + accessor, + &sum::sum_batch(values)?, + )?; + Ok(()) + } + + fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let counts = states[0].as_any().downcast_ref::().unwrap(); + let delta = compute::sum(counts).unwrap_or(0); + accessor.add_u64(self.start_index, delta); + + sum::add_to_row( + &self.sum_datatype, + self.start_index + 1, + accessor, + &sum::sum_batch(&states[1])?, + )?; + Ok(()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/aggregate/count.rs b/datafusion/physical-expr/src/aggregate/count.rs index 9e8485e928c25..b3210900fca6c 100644 --- a/datafusion/physical-expr/src/aggregate/count.rs +++ b/datafusion/physical-expr/src/aggregate/count.rs @@ -18,8 +18,10 @@ //! 
Defines physical expressions that can evaluated at runtime during query execution use std::any::Any; +use std::fmt::Debug; use std::sync::Arc; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::{AggregateExpr, PhysicalExpr}; use arrow::compute; use arrow::datatypes::DataType; @@ -30,6 +32,7 @@ use arrow::{ use datafusion_common::Result; use datafusion_common::ScalarValue; use datafusion_expr::Accumulator; +use datafusion_row::accessor::RowAccessor; use crate::expressions::format_state_name; @@ -92,6 +95,10 @@ impl AggregateExpr for Count { fn name(&self) -> &str { &self.name } + + fn row_state_supported(&self) -> bool { + true + } } #[derive(Debug)] @@ -131,6 +138,43 @@ impl Accumulator for CountAccumulator { } } +#[derive(Debug)] +struct CountRowAccumulator { + index: usize, +} + +impl CountRowAccumulator { + pub fn new(index: usize) -> Self { + Self { index } + } +} + +impl RowAccumulator for CountRowAccumulator { + fn update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let array = &values[0]; + let delta = (array.len() - array.data().null_count()) as u64; + accessor.add_u64(self.index, delta); + Ok(()) + } + + fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let counts = states[0].as_any().downcast_ref::().unwrap(); + let delta = &compute::sum(counts); + if let Some(d) = delta { + accessor.add_u64(self.index, *d); + } + Ok(()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 7de10e4b8a7e3..55ab77cbd647c 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -37,9 +37,11 @@ use datafusion_common::ScalarValue; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::Accumulator; +use crate::aggregate::row_accumulator::RowAccumulator; use 
crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; +use datafusion_row::accessor::RowAccessor; // Min/max aggregation can take Dictionary encode input but always produces unpacked // (aka non Dictionary) output. We need to adjust the output data type to reflect this. @@ -111,6 +113,22 @@ impl AggregateExpr for Max { fn name(&self) -> &str { &self.name } + + fn row_state_supported(&self) -> bool { + matches!( + self.data_type, + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + ) + } } // Statically-typed version of min/max(array) -> ScalarValue for string types. @@ -303,6 +321,18 @@ macro_rules! typed_min_max { }}; } +// min/max of two non-string scalar values. +macro_rules! typed_min_max_row { + ($INDEX:ident, $ACC:ident, $SCALAR:expr, $TYPE:ident, $OP:ident) => {{ + paste::item! { + match $SCALAR { + None => {} + Some(v) => $ACC.[<$OP _ $TYPE>]($INDEX, *v as $TYPE) + } + } + }}; +} + // min/max of two scalar string values. macro_rules! typed_min_max_string { ($VALUE:expr, $DELTA:expr, $SCALAR:ident, $OP:ident) => {{ @@ -408,16 +438,68 @@ macro_rules! min_max { }}; } +// min/max of two scalar values of the same type +macro_rules! 
min_max_row { + ($INDEX:ident, $ACC:ident, $SCALAR:expr, $OP:ident) => {{ + Ok(match $SCALAR { + ScalarValue::Float64(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, f64, $OP) + } + ScalarValue::Float32(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, f32, $OP) + } + ScalarValue::UInt64(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, u64, $OP) + } + ScalarValue::UInt32(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, u32, $OP) + } + ScalarValue::UInt16(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, u16, $OP) + } + ScalarValue::UInt8(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, u8, $OP) + } + ScalarValue::Int64(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, i64, $OP) + } + ScalarValue::Int32(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, i32, $OP) + } + ScalarValue::Int16(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, i16, $OP) + } + ScalarValue::Int8(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, i8, $OP) + } + e => { + return Err(DataFusionError::Internal(format!( + "MIN/MAX is not expected to receive scalars of incompatible types {:?}", + e + ))) + } + }) + }}; +} + /// the minimum of two scalar values pub fn min(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { min_max!(lhs, rhs, min) } +pub fn min_row(index: usize, accessor: &mut RowAccessor, s: &ScalarValue) -> Result<()> { + min_max_row!(index, accessor, s, min) +} + /// the maximum of two scalar values pub fn max(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { min_max!(lhs, rhs, max) } +pub fn max_row(index: usize, accessor: &mut RowAccessor, s: &ScalarValue) -> Result<()> { + min_max_row!(index, accessor, s, max) +} + /// An accumulator to compute the maximum value #[derive(Debug)] pub struct MaxAccumulator { @@ -454,6 +536,38 @@ impl Accumulator for MaxAccumulator { } } +#[derive(Debug)] +struct MaxRowAccumulator { + index: usize, +} + +impl MaxRowAccumulator { + pub fn new(index: usize) -> Self { + Self { index } + } +} + +impl RowAccumulator for MaxRowAccumulator { + fn 
update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let values = &values[0]; + let delta = &max_batch(values)?; + max_row(self.index, accessor, delta)?; + Ok(()) + } + + fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + self.update_batch(states, accessor) + } +} + /// MIN aggregate expression #[derive(Debug)] pub struct Min { @@ -512,6 +626,22 @@ impl AggregateExpr for Min { fn name(&self) -> &str { &self.name } + + fn row_state_supported(&self) -> bool { + matches!( + self.data_type, + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + ) + } } /// An accumulator to compute the minimum value @@ -550,6 +680,38 @@ impl Accumulator for MinAccumulator { } } +#[derive(Debug)] +struct MinRowAccumulator { + index: usize, +} + +impl MinRowAccumulator { + pub fn new(index: usize) -> Self { + Self { index } + } +} + +impl RowAccumulator for MinRowAccumulator { + fn update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let values = &values[0]; + let delta = &min_batch(values)?; + min_row(self.index, accessor, delta)?; + Ok(()) + } + + fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + self.update_batch(states, accessor) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index 019a60cd57607..13bef9018e5a9 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -38,6 +38,7 @@ pub(crate) mod distinct_expressions; pub(crate) mod min_max; pub mod build_in; mod hyperloglog; +pub mod row_accumulator; pub(crate) mod stats; pub(crate) mod stddev; pub(crate) mod sum; @@ -75,4 
+76,9 @@ pub trait AggregateExpr: Send + Sync + Debug { fn name(&self) -> &str { "AggregateExpr: default name" } + + /// If the aggregate expression is supported by row format + fn row_state_supported(&self) -> bool { + false + } } diff --git a/datafusion/physical-expr/src/aggregate/row_accumulator.rs b/datafusion/physical-expr/src/aggregate/row_accumulator.rs new file mode 100644 index 0000000000000..a0024f2eb3f22 --- /dev/null +++ b/datafusion/physical-expr/src/aggregate/row_accumulator.rs @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Accumulator in raw format + +use arrow::array::ArrayRef; +use datafusion_common::Result; +use datafusion_row::accessor::RowAccessor; +use std::fmt::Debug; + +pub trait RowAccumulator: Send + Sync + Debug { + /// updates the accumulator's state from a vector of arrays. + fn update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()>; + + /// updates the accumulator's state from a vector of states. 
+ fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()>; +} diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index 12572f9a9324a..82c34df3786f5 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -34,9 +34,11 @@ use arrow::{ use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::Accumulator; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; +use datafusion_row::accessor::RowAccessor; /// SUM aggregate expression #[derive(Debug)] @@ -96,6 +98,22 @@ impl AggregateExpr for Sum { fn name(&self) -> &str { &self.name } + + fn row_state_supported(&self) -> bool { + matches!( + self.data_type, + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + ) + } } #[derive(Debug)] @@ -180,6 +198,17 @@ macro_rules! typed_sum { }}; } +macro_rules! sum_row { + ($INDEX:ident, $ACC:ident, $DELTA:expr, $TYPE:ident) => {{ + paste::item! 
{ + match $DELTA { + None => {} + Some(v) => $ACC.[]($INDEX, *v as $TYPE) + } + } + }}; +} + // TODO implement this in arrow-rs with simd // https://github.com/apache/arrow-rs/issues/1010 fn sum_decimal( @@ -284,7 +313,7 @@ pub(crate) fn sum(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { (ScalarValue::UInt64(lhs), ScalarValue::UInt8(rhs)) => { typed_sum!(lhs, rhs, UInt64, u64) } - // i64 coerces i* to u64 + // i64 coerces i* to i64 (ScalarValue::Int64(lhs), ScalarValue::Int64(rhs)) => { typed_sum!(lhs, rhs, Int64, i64) } @@ -306,6 +335,84 @@ pub(crate) fn sum(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { }) } +pub(crate) fn add_to_row( + dt: &DataType, + index: usize, + accessor: &mut RowAccessor, + s: &ScalarValue, +) -> Result<()> { + match (dt, s) { + // float64 coerces everything to f64 + (DataType::Float64, ScalarValue::Float64(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::Float32(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::Int64(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::Int32(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::Int16(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::Int8(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::UInt64(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::UInt32(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::UInt16(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::UInt8(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + // float32 has no cast + (DataType::Float32, ScalarValue::Float32(rhs)) => { + sum_row!(index, accessor, rhs, f32) + } + // u64 coerces u* to u64 + (DataType::UInt64, ScalarValue::UInt64(rhs)) => { + sum_row!(index, accessor, rhs, u64) + } + 
(DataType::UInt64, ScalarValue::UInt32(rhs)) => { + sum_row!(index, accessor, rhs, u64) + } + (DataType::UInt64, ScalarValue::UInt16(rhs)) => { + sum_row!(index, accessor, rhs, u64) + } + (DataType::UInt64, ScalarValue::UInt8(rhs)) => { + sum_row!(index, accessor, rhs, u64) + } + // i64 coerces i* to i64 + (DataType::Int64, ScalarValue::Int64(rhs)) => { + sum_row!(index, accessor, rhs, i64) + } + (DataType::Int64, ScalarValue::Int32(rhs)) => { + sum_row!(index, accessor, rhs, i64) + } + (DataType::Int64, ScalarValue::Int16(rhs)) => { + sum_row!(index, accessor, rhs, i64) + } + (DataType::Int64, ScalarValue::Int8(rhs)) => { + sum_row!(index, accessor, rhs, i64) + } + e => { + return Err(DataFusionError::Internal(format!( + "Row sum updater is not expected to receive a scalar {:?}", + e + ))); + } + } + Ok(()) +} + impl Accumulator for SumAccumulator { fn state(&self) -> Result> { Ok(vec![self.sum.clone()]) @@ -329,6 +436,38 @@ impl Accumulator for SumAccumulator { } } +#[derive(Debug)] +struct SumRowAccumulator { + index: usize, + datatype: DataType, +} + +impl SumRowAccumulator { + pub fn new(index: usize, datatype: DataType) -> Self { + Self { index, datatype } + } +} + +impl RowAccumulator for SumRowAccumulator { + fn update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let values = &values[0]; + add_to_row(&self.datatype, self.index, accessor, &sum_batch(values)?)?; + Ok(()) + } + + fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + self.update_batch(states, accessor) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/row/src/accessor.rs b/datafusion/row/src/accessor.rs new file mode 100644 index 0000000000000..7b204fe85b7c2 --- /dev/null +++ b/datafusion/row/src/accessor.rs @@ -0,0 +1,302 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Setter/Getter for row with all fixed-sized fields. + +use crate::layout::{RowLayout, RowType}; +use crate::validity::{all_valid, NullBitsFormatter}; +use crate::{fn_get_idx, fn_get_idx_opt, fn_set_idx, get_idx, set_idx}; +use arrow::datatypes::Schema; +use arrow::util::bit_util::{get_bit_raw, set_bit_raw, unset_bit_raw}; + +//TODO: DRY with reader and writer + +/// Read the tuple `data[base_offset..]` we are currently pointing to +pub struct RowAccessor<'a> { + /// Layout on how to read each field + layout: RowLayout, + /// Raw bytes slice where the tuple stores + data: &'a mut [u8], + /// Start position for the current tuple in the raw bytes slice. + base_offset: usize, +} + +impl<'a> std::fmt::Debug for RowAccessor<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.null_free() { + write!(f, "null_free") + } else { + let null_bits = self.null_bits(); + write!( + f, + "{:?}", + NullBitsFormatter::new(null_bits, self.layout.field_count) + ) + } + } +} + +#[macro_export] +macro_rules! fn_add_idx { + ($NATIVE: ident) => { + paste::item! 
{ + /// add field at `idx` with `value` + pub fn [](&mut self, idx: usize, value: $NATIVE) { + if self.is_valid_at(idx) { + self.[](idx, value + self.[](idx)); + } else { + self.set_non_null_at(idx); + self.[](idx, value); + } + } + } + }; +} + +macro_rules! fn_max_min_idx { + ($NATIVE: ident, $OP: ident) => { + paste::item! { + /// check max then update + pub fn [<$OP _ $NATIVE>](&mut self, idx: usize, value: $NATIVE) { + if self.is_valid_at(idx) { + let v = value.$OP(self.[](idx)); + self.[](idx, v); + } else { + self.set_non_null_at(idx); + self.[](idx, value); + } + } + } + }; +} + +impl<'a> RowAccessor<'a> { + /// new + pub fn new(schema: &Schema, row_type: RowType) -> Self { + Self { + layout: RowLayout::new(schema, row_type), + data: &mut [], + base_offset: 0, + } + } + + /// Update this row to point to position `offset` in `base` + pub fn point_to(&mut self, offset: usize, data: &'a mut [u8]) { + self.base_offset = offset; + self.data = data; + } + + #[inline] + fn assert_index_valid(&self, idx: usize) { + assert!(idx < self.layout.field_count); + } + + #[inline(always)] + fn field_offsets(&self) -> &[usize] { + &self.layout.field_offsets + } + + #[inline(always)] + fn null_free(&self) -> bool { + self.layout.null_free + } + + #[inline(always)] + fn null_bits(&self) -> &[u8] { + if self.null_free() { + &[] + } else { + let start = self.base_offset; + &self.data[start..start + self.layout.null_width] + } + } + + #[inline(always)] + fn all_valid(&self) -> bool { + if self.null_free() { + true + } else { + let null_bits = self.null_bits(); + all_valid(null_bits, self.layout.field_count) + } + } + + fn is_valid_at(&self, idx: usize) -> bool { + unsafe { get_bit_raw(self.null_bits().as_ptr(), idx) } + } + + // ------------------------------ + // ----- Fixed Sized getters ---- + // ------------------------------ + + fn get_bool(&self, idx: usize) -> bool { + self.assert_index_valid(idx); + let offset = self.field_offsets()[idx]; + let value = 
&self.data[self.base_offset + offset..]; + value[0] != 0 + } + + fn get_u8(&self, idx: usize) -> u8 { + self.assert_index_valid(idx); + let offset = self.field_offsets()[idx]; + self.data[self.base_offset + offset] + } + + fn_get_idx!(u16, 2); + fn_get_idx!(u32, 4); + fn_get_idx!(u64, 8); + fn_get_idx!(i8, 1); + fn_get_idx!(i16, 2); + fn_get_idx!(i32, 4); + fn_get_idx!(i64, 8); + fn_get_idx!(f32, 4); + fn_get_idx!(f64, 8); + + fn get_date32(&self, idx: usize) -> i32 { + get_idx!(i32, self, idx, 4) + } + + fn get_date64(&self, idx: usize) -> i64 { + get_idx!(i64, self, idx, 8) + } + + fn_get_idx_opt!(bool); + fn_get_idx_opt!(u8); + fn_get_idx_opt!(u16); + fn_get_idx_opt!(u32); + fn_get_idx_opt!(u64); + fn_get_idx_opt!(i8); + fn_get_idx_opt!(i16); + fn_get_idx_opt!(i32); + fn_get_idx_opt!(i64); + fn_get_idx_opt!(f32); + fn_get_idx_opt!(f64); + + fn get_date32_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_date32(idx)) + } else { + None + } + } + + fn get_date64_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_date64(idx)) + } else { + None + } + } + + // ------------------------------ + // ----- Fixed Sized setters ---- + // ------------------------------ + + pub(crate) fn set_null_at(&mut self, idx: usize) { + assert!( + !self.null_free(), + "Unexpected call to set_null_at on null-free row writer" + ); + let null_bits = &mut self.data[0..self.layout.null_width]; + unsafe { + unset_bit_raw(null_bits.as_mut_ptr(), idx); + } + } + + pub(crate) fn set_non_null_at(&mut self, idx: usize) { + assert!( + !self.null_free(), + "Unexpected call to set_non_null_at on null-free row writer" + ); + let null_bits = &mut self.data[0..self.layout.null_width]; + unsafe { + set_bit_raw(null_bits.as_mut_ptr(), idx); + } + } + + fn set_bool(&mut self, idx: usize, value: bool) { + self.assert_index_valid(idx); + let offset = self.field_offsets()[idx]; + self.data[offset] = if value { 1 } else { 0 }; + } + + fn set_u8(&mut 
self, idx: usize, value: u8) { + self.assert_index_valid(idx); + let offset = self.field_offsets()[idx]; + self.data[offset] = value; + } + + fn_set_idx!(u16, 2); + fn_set_idx!(u32, 4); + fn_set_idx!(u64, 8); + fn_set_idx!(i16, 2); + fn_set_idx!(i32, 4); + fn_set_idx!(i64, 8); + fn_set_idx!(f32, 4); + fn_set_idx!(f64, 8); + + fn set_i8(&mut self, idx: usize, value: i8) { + self.assert_index_valid(idx); + let offset = self.field_offsets()[idx]; + self.data[offset] = value.to_le_bytes()[0]; + } + + fn set_date32(&mut self, idx: usize, value: i32) { + set_idx!(4, self, idx, value) + } + + fn set_date64(&mut self, idx: usize, value: i64) { + set_idx!(8, self, idx, value) + } + + // ------------------------------ + // ---- Fixed sized updaters ---- + // ------------------------------ + + fn_add_idx!(u8); + fn_add_idx!(u16); + fn_add_idx!(u32); + fn_add_idx!(u64); + fn_add_idx!(i8); + fn_add_idx!(i16); + fn_add_idx!(i32); + fn_add_idx!(i64); + fn_add_idx!(f32); + fn_add_idx!(f64); + + fn_max_min_idx!(u8, max); + fn_max_min_idx!(u16, max); + fn_max_min_idx!(u32, max); + fn_max_min_idx!(u64, max); + fn_max_min_idx!(i8, max); + fn_max_min_idx!(i16, max); + fn_max_min_idx!(i32, max); + fn_max_min_idx!(i64, max); + fn_max_min_idx!(f32, max); + fn_max_min_idx!(f64, max); + + fn_max_min_idx!(u8, min); + fn_max_min_idx!(u16, min); + fn_max_min_idx!(u32, min); + fn_max_min_idx!(u64, min); + fn_max_min_idx!(i8, min); + fn_max_min_idx!(i16, min); + fn_max_min_idx!(i32, min); + fn_max_min_idx!(i64, min); + fn_max_min_idx!(f32, min); + fn_max_min_idx!(f64, min); +} diff --git a/datafusion/row/src/layout.rs b/datafusion/row/src/layout.rs index b017d195836d4..2c4c15da5a09e 100644 --- a/datafusion/row/src/layout.rs +++ b/datafusion/row/src/layout.rs @@ -39,7 +39,7 @@ pub enum RowType { /// Reveals how the fields of a record are stored in the raw-bytes format #[derive(Debug)] -pub(crate) struct RowLayout { +pub struct RowLayout { /// Type of the layout row_type: RowType, /// If a row is 
null free according to its schema @@ -55,7 +55,8 @@ pub(crate) struct RowLayout { } impl RowLayout { - pub(crate) fn new(schema: &Schema, row_type: RowType) -> Self { + /// new + pub fn new(schema: &Schema, row_type: RowType) -> Self { assert!(row_supported(schema, row_type)); let null_free = schema_null_free(schema); let field_count = schema.fields().len(); @@ -81,8 +82,9 @@ impl RowLayout { } } + /// Get fixed part width for this layout #[inline(always)] - pub(crate) fn fixed_part_width(&self) -> usize { + pub fn fixed_part_width(&self) -> usize { self.null_width + self.values_width } } diff --git a/datafusion/row/src/lib.rs b/datafusion/row/src/lib.rs index 54c112dd5e063..f954b16bc36cd 100644 --- a/datafusion/row/src/lib.rs +++ b/datafusion/row/src/lib.rs @@ -54,6 +54,7 @@ use arrow::record_batch::RecordBatch; pub use layout::RowType; use std::sync::Arc; +pub mod accessor; #[cfg(feature = "jit")] pub mod jit; pub mod layout; diff --git a/datafusion/row/src/reader.rs b/datafusion/row/src/reader.rs index e7ee004b0076d..77e9a552cbf84 100644 --- a/datafusion/row/src/reader.rs +++ b/datafusion/row/src/reader.rs @@ -46,6 +46,7 @@ pub fn read_as_batch( output.output().map_err(DataFusionError::ArrowError) } +#[macro_export] macro_rules! get_idx { ($NATIVE: ident, $SELF: ident, $IDX: ident, $WIDTH: literal) => {{ $SELF.assert_index_valid($IDX); @@ -56,6 +57,7 @@ macro_rules! get_idx { }}; } +#[macro_export] macro_rules! fn_get_idx { ($NATIVE: ident, $WIDTH: literal) => { paste::item! { @@ -70,6 +72,7 @@ macro_rules! fn_get_idx { }; } +#[macro_export] macro_rules! fn_get_idx_opt { ($NATIVE: ident) => { paste::item! { diff --git a/datafusion/row/src/writer.rs b/datafusion/row/src/writer.rs index 6b9ffdc0e31d5..d71e1dbc073c1 100644 --- a/datafusion/row/src/writer.rs +++ b/datafusion/row/src/writer.rs @@ -75,6 +75,7 @@ pub fn bench_write_batch( Ok(lengths) } +#[macro_export] macro_rules! 
set_idx { ($WIDTH: literal, $SELF: ident, $IDX: ident, $VALUE: ident) => {{ $SELF.assert_index_valid($IDX); @@ -83,6 +84,7 @@ macro_rules! set_idx { }}; } +#[macro_export] macro_rules! fn_set_idx { ($NATIVE: ident, $WIDTH: literal) => { paste::item! { From ddfd601d20f471112dc3be61c90411e71aa7375e Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 29 Apr 2022 13:25:38 +0800 Subject: [PATCH 3/8] main updating procedure --- .../core/src/physical_plan/aggregates/mod.rs | 18 +++++++++++ .../src/physical_plan/aggregates/row_hash.rs | 30 ++++++++++++++----- .../physical-expr/src/aggregate/average.rs | 7 +++++ .../physical-expr/src/aggregate/count.rs | 4 +++ .../physical-expr/src/aggregate/min_max.rs | 8 +++++ datafusion/physical-expr/src/aggregate/mod.rs | 5 ++++ datafusion/physical-expr/src/aggregate/sum.rs | 4 +++ 7 files changed, 68 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index d2cb486dba3ca..a91fb2da5102a 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -45,6 +45,7 @@ mod no_grouping; mod row_hash; pub use datafusion_expr::AggregateFunction; +use datafusion_physical_expr::aggregate::row_accumulator::RowAccumulator; pub use datafusion_physical_expr::expressions::create_aggregate_expr; /// Hash aggregate modes @@ -366,6 +367,7 @@ fn merge_expressions( } pub(crate) type AccumulatorItem = Box; +pub(crate) type AccumulatorItemV2 = Box; fn create_accumulators( aggr_expr: &[Arc], @@ -376,6 +378,22 @@ fn create_accumulators( .collect::>>() } +fn check_accumulator_v2_supported( + aggr_expr: &[Arc] +) -> bool { + aggr_expr.iter().all(|expr| expr.row_state_supported()) +} + +fn create_accumulators_v2( + aggr_expr: &[Arc], +) -> datafusion_common::Result> { + aggr_expr + .iter() + .enumerate() + .map(|(idx, expr)| expr.create_accumulator_v2(idx)) + .collect::>>() +} + /// returns a vector of 
ArrayRefs, where each entry corresponds to either the /// final value (mode = Final) or states (mode = Partial) fn finalize_aggregation( diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index 251adb23790b4..943f38f9c02d0 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -28,7 +28,7 @@ use futures::{ }; use crate::error::Result; -use crate::physical_plan::aggregates::{evaluate, evaluate_many, AggregateMode}; +use crate::physical_plan::aggregates::{evaluate, evaluate_many, AggregateMode, AccumulatorItemV2}; use crate::physical_plan::hash_utils::create_row_hashes; use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; @@ -45,6 +45,7 @@ use datafusion_row::layout::RowLayout; use datafusion_row::writer::{write_row, RowWriter}; use datafusion_row::RowType; use hashbrown::raw::RawTable; +use datafusion_row::accessor::RowAccessor; /* The architecture is the following: @@ -80,6 +81,7 @@ pub(crate) struct GroupedRowHashAggregateStream { aggr_expr: Vec>, group_expr: Vec>, + accs_v2: Vec, group_schema: SchemaRef, aggr_schema: SchemaRef, @@ -117,6 +119,8 @@ impl GroupedRowHashAggregateStream { let aggregate_expressions = aggregates::aggregate_expressions(&aggr_expr, &mode, group_expr.len())?; + let accs_v2 = aggregates::create_accumulators_v2(&aggr_expr)?; + let (group_schema, aggr_schema) = create_separate_schema(&schema, group_expr.len()); let aggr_layout = RowLayout::new(&aggr_schema, RowType::WordAligned); @@ -129,6 +133,7 @@ impl GroupedRowHashAggregateStream { input, aggr_expr, group_expr, + accs_v2, group_schema, aggr_schema, aggr_layout, @@ -165,6 +170,7 @@ impl Stream for GroupedRowHashAggregateStream { &this.random_state, &this.group_expr, &this.aggr_expr, + &mut this.accs_v2, &this.group_schema, &this.aggr_schema, 
&this.aggr_layout, @@ -216,6 +222,7 @@ fn group_aggregate_batch( random_state: &RandomState, group_expr: &[Arc], aggr_expr: &[Arc], + accs_v2: &mut [AccumulatorItemV2], group_schema: &Schema, aggr_schema: &Schema, aggr_row_layout: &RowLayout, @@ -232,6 +239,7 @@ fn group_aggregate_batch( // We could evaluate them after the `take`, but since we need to evaluate all // of them anyways, it is more performant to do it while they are together. let aggr_input_values = evaluate_many(aggregate_expressions, &batch)?; + let mut state_accessor = RowAccessor::new(aggr_schema, RowType::WordAligned); // 1.1 construct the key from the group values // 1.2 construct the mapping key if it does not exist @@ -325,8 +333,7 @@ fn group_aggregate_batch( .try_for_each(|(group_idx, offsets)| { let group_state = &mut accumulators.group_states[*group_idx]; // 2.2 - group_state - .accumulator_set + accs_v2 .iter_mut() .zip(values.iter()) .map(|(accumulator, aggr_array)| { @@ -341,11 +348,14 @@ fn group_aggregate_batch( .collect::>(), ) }) - .try_for_each(|(accumulator, values)| match mode { - AggregateMode::Partial => accumulator.update_batch(&values), - AggregateMode::FinalPartitioned | AggregateMode::Final => { - // note: the aggregation here is over states, not values, thus the merge - accumulator.merge_batch(&values) + .try_for_each(|(accumulator, values)| { + state_accessor.point_to(0, group_state.aggregation_buffer.as_mut_slice()); + match mode { + AggregateMode::Partial => accumulator.update_batch(&values, &mut state_accessor), + AggregateMode::FinalPartitioned | AggregateMode::Final => { + // note: the aggregation here is over states, not values, thus the merge + accumulator.merge_batch(&values, &mut state_accessor) + } } }) // 2.5 @@ -410,3 +420,7 @@ fn create_group_rows(arrays: Vec, schema: &Schema) -> Vec> { } results } + +fn create_state_accessor(schema: &Schema) -> RowAccessor { + RowAccessor:: +} diff --git a/datafusion/physical-expr/src/aggregate/average.rs 
b/datafusion/physical-expr/src/aggregate/average.rs index a917af3993732..10acde5170381 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -119,6 +119,13 @@ impl AggregateExpr for Avg { | DataType::Float64 ) } + + fn create_accumulator_v2(&self, start_index: usize) -> Result> { + Ok(Box::new(AvgRowAccumulator::new( + start_index, + self.data_type.clone(), + ))) + } } /// An accumulator to compute the average diff --git a/datafusion/physical-expr/src/aggregate/count.rs b/datafusion/physical-expr/src/aggregate/count.rs index b3210900fca6c..b66b98470760e 100644 --- a/datafusion/physical-expr/src/aggregate/count.rs +++ b/datafusion/physical-expr/src/aggregate/count.rs @@ -99,6 +99,10 @@ impl AggregateExpr for Count { fn row_state_supported(&self) -> bool { true } + + fn create_accumulator_v2(&self, start_index: usize) -> Result> { + Ok(Box::new(CountRowAccumulator::new(start_index))) + } } #[derive(Debug)] diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 55ab77cbd647c..9abbea062efda 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -129,6 +129,10 @@ impl AggregateExpr for Max { | DataType::Float64 ) } + + fn create_accumulator_v2(&self, start_index: usize) -> Result> { + Ok(Box::new(MaxRowAccumulator::new(start_index))) + } } // Statically-typed version of min/max(array) -> ScalarValue for string types. 
@@ -642,6 +646,10 @@ impl AggregateExpr for Min { | DataType::Float64 ) } + + fn create_accumulator_v2(&self, start_index: usize) -> Result> { + Ok(Box::new(MinRowAccumulator::new(start_index))) + } } /// An accumulator to compute the minimum value diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index 13bef9018e5a9..411638a43ced2 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -22,6 +22,7 @@ use datafusion_expr::Accumulator; use std::any::Any; use std::fmt::Debug; use std::sync::Arc; +use crate::aggregate::row_accumulator::RowAccumulator; pub(crate) mod approx_distinct; pub(crate) mod approx_median; @@ -81,4 +82,8 @@ pub trait AggregateExpr: Send + Sync + Debug { fn row_state_supported(&self) -> bool { false } + + fn create_accumulator_v2(&self, _start_index: usize) -> Result> { + unreachable!() + } } diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index 82c34df3786f5..431a377871d81 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -114,6 +114,10 @@ impl AggregateExpr for Sum { | DataType::Float64 ) } + + fn create_accumulator_v2(&self, start_index: usize) -> Result> { + Ok(Box::new(SumRowAccumulator::new(start_index, self.data_type.clone()))) + } } #[derive(Debug)] From 2dd2d16a692b5068007d5c93d86821cfe6ef0ecb Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 29 Apr 2022 19:04:42 +0800 Subject: [PATCH 4/8] output as record batch --- .../core/src/physical_plan/aggregates/mod.rs | 10 +- .../src/physical_plan/aggregates/row_hash.rs | 104 +++++++++++++--- .../{row_accumulator.rs => accumulator_v2.rs} | 7 +- .../physical-expr/src/aggregate/average.rs | 30 +++-- .../physical-expr/src/aggregate/count.rs | 21 ++-- .../physical-expr/src/aggregate/min_max.rs | 84 ++++++++----- datafusion/physical-expr/src/aggregate/mod.rs | 11 
+- datafusion/physical-expr/src/aggregate/sum.rs | 24 ++-- datafusion/row/src/accessor.rs | 113 ++++++++---------- datafusion/row/src/layout.rs | 2 +- datafusion/row/src/lib.rs | 10 +- datafusion/row/src/reader.rs | 2 +- 12 files changed, 274 insertions(+), 144 deletions(-) rename datafusion/physical-expr/src/aggregate/{row_accumulator.rs => accumulator_v2.rs} (85%) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index a91fb2da5102a..45f40629b4fc0 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -45,7 +45,7 @@ mod no_grouping; mod row_hash; pub use datafusion_expr::AggregateFunction; -use datafusion_physical_expr::aggregate::row_accumulator::RowAccumulator; +use datafusion_physical_expr::aggregate::accumulator_v2::AccumulatorV2; pub use datafusion_physical_expr::expressions::create_aggregate_expr; /// Hash aggregate modes @@ -367,7 +367,7 @@ fn merge_expressions( } pub(crate) type AccumulatorItem = Box; -pub(crate) type AccumulatorItemV2 = Box; +pub(crate) type AccumulatorItemV2 = Box; fn create_accumulators( aggr_expr: &[Arc], @@ -378,10 +378,8 @@ fn create_accumulators( .collect::>>() } -fn check_accumulator_v2_supported( - aggr_expr: &[Arc] -) -> bool { - aggr_expr.iter().all(|expr| expr.row_state_supported()) +fn check_accumulator_v2_supported(aggr_expr: &[Arc]) -> bool { + aggr_expr.iter().all(|expr| expr.accumulator_v2_supported()) } fn create_accumulators_v2( diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index 943f38f9c02d0..2da8e39d42d8a 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -28,12 +28,15 @@ use futures::{ }; use crate::error::Result; -use crate::physical_plan::aggregates::{evaluate, evaluate_many, AggregateMode, AccumulatorItemV2}; 
+use crate::physical_plan::aggregates::{ + evaluate, evaluate_many, AccumulatorItemV2, AggregateMode, +}; use crate::physical_plan::hash_utils::create_row_hashes; use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; +use arrow::compute::cast; use arrow::datatypes::Schema; use arrow::{array::ArrayRef, compute}; use arrow::{ @@ -41,11 +44,13 @@ use arrow::{ error::{ArrowError, Result as ArrowResult}, }; use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; +use datafusion_common::ScalarValue; +use datafusion_row::accessor::RowAccessor; use datafusion_row::layout::RowLayout; +use datafusion_row::reader::{read_row, RowReader}; use datafusion_row::writer::{write_row, RowWriter}; -use datafusion_row::RowType; +use datafusion_row::{MutableRecordBatch, RowType}; use hashbrown::raw::RawTable; -use datafusion_row::accessor::RowAccessor; /* The architecture is the following: @@ -79,7 +84,6 @@ pub(crate) struct GroupedRowHashAggregateStream { accumulators: Accumulators, aggregate_expressions: Vec>>, - aggr_expr: Vec>, group_expr: Vec>, accs_v2: Vec, @@ -131,7 +135,6 @@ impl GroupedRowHashAggregateStream { schema, mode, input, - aggr_expr, group_expr, accs_v2, group_schema, @@ -169,10 +172,8 @@ impl Stream for GroupedRowHashAggregateStream { &this.mode, &this.random_state, &this.group_expr, - &this.aggr_expr, &mut this.accs_v2, &this.group_schema, - &this.aggr_schema, &this.aggr_layout, this.aggr_buffer_width, batch, @@ -193,8 +194,10 @@ impl Stream for GroupedRowHashAggregateStream { let timer = this.baseline_metrics.elapsed_compute().timer(); let result = create_batch_from_map( &this.mode, - &this.accumulators, - this.group_expr.len(), + &this.group_schema, + &this.aggr_schema, + &mut this.accumulators, + &mut this.accs_v2, &this.schema, ) .record_output(&this.baseline_metrics); @@ -221,11 +224,9 @@ fn 
group_aggregate_batch( mode: &AggregateMode, random_state: &RandomState, group_expr: &[Arc], - aggr_expr: &[Arc], accs_v2: &mut [AccumulatorItemV2], group_schema: &Schema, - aggr_schema: &Schema, - aggr_row_layout: &RowLayout, + state_layout: &RowLayout, aggr_buffer_width: usize, batch: RecordBatch, accumulators: &mut Accumulators, @@ -239,7 +240,6 @@ fn group_aggregate_batch( // We could evaluate them after the `take`, but since we need to evaluate all // of them anyways, it is more performant to do it while they are together. let aggr_input_values = evaluate_many(aggregate_expressions, &batch)?; - let mut state_accessor = RowAccessor::new(aggr_schema, RowType::WordAligned); // 1.1 construct the key from the group values // 1.2 construct the mapping key if it does not exist @@ -349,9 +349,14 @@ fn group_aggregate_batch( ) }) .try_for_each(|(accumulator, values)| { - state_accessor.point_to(0, group_state.aggregation_buffer.as_mut_slice()); + let mut state_accessor = + RowAccessor::new_from_layout(state_layout.clone()); + state_accessor + .point_to(0, group_state.aggregation_buffer.as_mut_slice()); match mode { - AggregateMode::Partial => accumulator.update_batch(&values, &mut state_accessor), + AggregateMode::Partial => { + accumulator.update_batch(&values, &mut state_accessor) + } AggregateMode::FinalPartitioned | AggregateMode::Final => { // note: the aggregation here is over states, not values, thus the merge accumulator.merge_batch(&values, &mut state_accessor) @@ -421,6 +426,71 @@ fn create_group_rows(arrays: Vec, schema: &Schema) -> Vec> { results } -fn create_state_accessor(schema: &Schema) -> RowAccessor { - RowAccessor:: +/// Create a RecordBatch with all group keys and accumulator' states or values. 
+fn create_batch_from_map( + mode: &AggregateMode, + group_schema: &Schema, + aggr_schema: &Schema, + accumulators: &mut Accumulators, + accs_v2: &mut [AccumulatorItemV2], + output_schema: &Schema, +) -> ArrowResult { + if accumulators.group_states.is_empty() { + return Ok(RecordBatch::new_empty(Arc::new(output_schema.to_owned()))); + } + + let mut state_accessor = RowAccessor::new(aggr_schema, RowType::WordAligned); + + let (group_buffers, mut state_buffers): (Vec<_>, Vec<_>) = accumulators + .group_states + .iter() + .map(|gs| (gs.group_by_values.clone(), gs.aggregation_buffer.clone())) + .unzip(); + + let mut columns: Vec = + read_as_batch(&group_buffers, group_schema, RowType::Compact); + + match mode { + AggregateMode::Partial => columns.extend(read_as_batch( + &state_buffers, + aggr_schema, + RowType::WordAligned, + )), + AggregateMode::Final | AggregateMode::FinalPartitioned => { + let mut results: Vec> = vec![vec![]; accs_v2.len()]; + for buffer in state_buffers.iter_mut() { + state_accessor.point_to(0, buffer); + for (i, acc) in accs_v2.iter().enumerate() { + results[i].push(acc.evaluate(&state_accessor).unwrap()); + } + } + for scalars in results { + columns.push(ScalarValue::iter_to_array(scalars)?); + } + } + } + + // cast output if needed (e.g. 
for types like Dictionary where + // the intermediate GroupByScalar type was not the same as the + // output + let columns = columns + .iter() + .zip(output_schema.fields().iter()) + .map(|(col, desired_field)| cast(col, desired_field.data_type())) + .collect::>>()?; + + RecordBatch::try_new(Arc::new(output_schema.to_owned()), columns) +} + +fn read_as_batch(rows: &[Vec], schema: &Schema, row_type: RowType) -> Vec { + let row_num = rows.len(); + let mut output = MutableRecordBatch::new(row_num, Arc::new(schema.clone())); + let mut row = RowReader::new(&schema, row_type); + + for data in rows { + row.point_to(0, data); + read_row(&row, &mut output, schema); + } + + output.output_as_columns() } diff --git a/datafusion/physical-expr/src/aggregate/row_accumulator.rs b/datafusion/physical-expr/src/aggregate/accumulator_v2.rs similarity index 85% rename from datafusion/physical-expr/src/aggregate/row_accumulator.rs rename to datafusion/physical-expr/src/aggregate/accumulator_v2.rs index a0024f2eb3f22..dc8345064dcff 100644 --- a/datafusion/physical-expr/src/aggregate/row_accumulator.rs +++ b/datafusion/physical-expr/src/aggregate/accumulator_v2.rs @@ -18,11 +18,11 @@ //! Accumulator in raw format use arrow::array::ArrayRef; -use datafusion_common::Result; +use datafusion_common::{Result, ScalarValue}; use datafusion_row::accessor::RowAccessor; use std::fmt::Debug; -pub trait RowAccumulator: Send + Sync + Debug { +pub trait AccumulatorV2: Send + Sync + Debug { /// updates the accumulator's state from a vector of arrays. fn update_batch( &mut self, @@ -36,4 +36,7 @@ pub trait RowAccumulator: Send + Sync + Debug { states: &[ArrayRef], accessor: &mut RowAccessor, ) -> Result<()>; + + /// returns its value based on its current state. 
+ fn evaluate(&self, accessor: &RowAccessor) -> Result; } diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 10acde5170381..c95d8a07b211c 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; -use crate::aggregate::row_accumulator::RowAccumulator; +use crate::aggregate::accumulator_v2::AccumulatorV2; use crate::aggregate::sum; use crate::expressions::format_state_name; use crate::{AggregateExpr, PhysicalExpr}; @@ -104,7 +104,7 @@ impl AggregateExpr for Avg { &self.name } - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -120,8 +120,11 @@ impl AggregateExpr for Avg { ) } - fn create_accumulator_v2(&self, start_index: usize) -> Result> { - Ok(Box::new(AvgRowAccumulator::new( + fn create_accumulator_v2( + &self, + start_index: usize, + ) -> Result> { + Ok(Box::new(AvgAccumulatorV2::new( start_index, self.data_type.clone(), ))) @@ -193,12 +196,12 @@ impl Accumulator for AvgAccumulator { } #[derive(Debug)] -struct AvgRowAccumulator { +struct AvgAccumulatorV2 { start_index: usize, sum_datatype: DataType, } -impl AvgRowAccumulator { +impl AvgAccumulatorV2 { pub fn new(start_index: usize, sum_datatype: DataType) -> Self { Self { start_index, @@ -207,7 +210,7 @@ impl AvgRowAccumulator { } } -impl RowAccumulator for AvgRowAccumulator { +impl AccumulatorV2 for AvgAccumulatorV2 { fn update_batch( &mut self, values: &[ArrayRef], @@ -243,6 +246,19 @@ impl RowAccumulator for AvgRowAccumulator { )?; Ok(()) } + + fn evaluate(&self, accessor: &RowAccessor) -> Result { + assert_eq!(self.sum_datatype, DataType::Float64); + Ok(match accessor.get_u64_opt(self.start_index) { + None => ScalarValue::Float64(None), + Some(0) => ScalarValue::Float64(Some(0.0)), + Some(n) => 
ScalarValue::Float64( + accessor + .get_f64_opt(self.start_index + 1) + .map(|f| f / n as f64), + ), + }) + } } #[cfg(test)] diff --git a/datafusion/physical-expr/src/aggregate/count.rs b/datafusion/physical-expr/src/aggregate/count.rs index b66b98470760e..9ccd13d5753ed 100644 --- a/datafusion/physical-expr/src/aggregate/count.rs +++ b/datafusion/physical-expr/src/aggregate/count.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -use crate::aggregate::row_accumulator::RowAccumulator; +use crate::aggregate::accumulator_v2::AccumulatorV2; use crate::{AggregateExpr, PhysicalExpr}; use arrow::compute; use arrow::datatypes::DataType; @@ -96,12 +96,15 @@ impl AggregateExpr for Count { &self.name } - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { true } - fn create_accumulator_v2(&self, start_index: usize) -> Result> { - Ok(Box::new(CountRowAccumulator::new(start_index))) + fn create_accumulator_v2( + &self, + start_index: usize, + ) -> Result> { + Ok(Box::new(CountAccumulatorV2::new(start_index))) } } @@ -143,17 +146,17 @@ impl Accumulator for CountAccumulator { } #[derive(Debug)] -struct CountRowAccumulator { +struct CountAccumulatorV2 { index: usize, } -impl CountRowAccumulator { +impl CountAccumulatorV2 { pub fn new(index: usize) -> Self { Self { index } } } -impl RowAccumulator for CountRowAccumulator { +impl AccumulatorV2 for CountAccumulatorV2 { fn update_batch( &mut self, values: &[ArrayRef], @@ -177,6 +180,10 @@ impl RowAccumulator for CountRowAccumulator { } Ok(()) } + + fn evaluate(&self, accessor: &RowAccessor) -> Result { + Ok(accessor.get_as_scalar(&DataType::UInt64, self.index)) + } } #[cfg(test)] diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 9abbea062efda..2062d16c71245 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -37,7 +37,7 @@ use 
datafusion_common::ScalarValue; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::Accumulator; -use crate::aggregate::row_accumulator::RowAccumulator; +use crate::aggregate::accumulator_v2::AccumulatorV2; use crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; @@ -114,7 +114,7 @@ impl AggregateExpr for Max { &self.name } - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -130,8 +130,14 @@ impl AggregateExpr for Max { ) } - fn create_accumulator_v2(&self, start_index: usize) -> Result> { - Ok(Box::new(MaxRowAccumulator::new(start_index))) + fn create_accumulator_v2( + &self, + start_index: usize, + ) -> Result> { + Ok(Box::new(MaxAccumulatorV2::new( + start_index, + self.data_type.clone(), + ))) } } @@ -326,7 +332,7 @@ macro_rules! typed_min_max { } // min/max of two non-string scalar values. -macro_rules! typed_min_max_row { +macro_rules! typed_min_max_v2 { ($INDEX:ident, $ACC:ident, $SCALAR:expr, $TYPE:ident, $OP:ident) => {{ paste::item! { match $SCALAR { @@ -443,38 +449,38 @@ macro_rules! min_max { } // min/max of two scalar values of the same type -macro_rules! min_max_row { +macro_rules! 
min_max_v2 { ($INDEX:ident, $ACC:ident, $SCALAR:expr, $OP:ident) => {{ Ok(match $SCALAR { ScalarValue::Float64(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, f64, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, f64, $OP) } ScalarValue::Float32(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, f32, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, f32, $OP) } ScalarValue::UInt64(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, u64, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, u64, $OP) } ScalarValue::UInt32(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, u32, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, u32, $OP) } ScalarValue::UInt16(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, u16, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, u16, $OP) } ScalarValue::UInt8(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, u8, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, u8, $OP) } ScalarValue::Int64(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, i64, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, i64, $OP) } ScalarValue::Int32(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, i32, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, i32, $OP) } ScalarValue::Int16(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, i16, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, i16, $OP) } ScalarValue::Int8(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, i8, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, i8, $OP) } e => { return Err(DataFusionError::Internal(format!( @@ -492,7 +498,7 @@ pub fn min(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { } pub fn min_row(index: usize, accessor: &mut RowAccessor, s: &ScalarValue) -> Result<()> { - min_max_row!(index, accessor, s, min) + min_max_v2!(index, accessor, s, min) } /// the maximum of two scalar values @@ -501,7 +507,7 @@ pub fn max(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { } pub fn max_row(index: usize, accessor: &mut RowAccessor, s: &ScalarValue) -> Result<()> { - min_max_row!(index, accessor, s, max) + min_max_v2!(index, accessor, s, max) } /// An 
accumulator to compute the maximum value @@ -541,17 +547,18 @@ impl Accumulator for MaxAccumulator { } #[derive(Debug)] -struct MaxRowAccumulator { +struct MaxAccumulatorV2 { index: usize, + data_type: DataType, } -impl MaxRowAccumulator { - pub fn new(index: usize) -> Self { - Self { index } +impl MaxAccumulatorV2 { + pub fn new(index: usize, data_type: DataType) -> Self { + Self { index, data_type } } } -impl RowAccumulator for MaxRowAccumulator { +impl AccumulatorV2 for MaxAccumulatorV2 { fn update_batch( &mut self, values: &[ArrayRef], @@ -570,6 +577,10 @@ impl RowAccumulator for MaxRowAccumulator { ) -> Result<()> { self.update_batch(states, accessor) } + + fn evaluate(&self, accessor: &RowAccessor) -> Result { + Ok(accessor.get_as_scalar(&self.data_type, self.index)) + } } /// MIN aggregate expression @@ -631,7 +642,7 @@ impl AggregateExpr for Min { &self.name } - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -647,8 +658,14 @@ impl AggregateExpr for Min { ) } - fn create_accumulator_v2(&self, start_index: usize) -> Result> { - Ok(Box::new(MinRowAccumulator::new(start_index))) + fn create_accumulator_v2( + &self, + start_index: usize, + ) -> Result> { + Ok(Box::new(MinAccumulatorV2::new( + start_index, + self.data_type.clone(), + ))) } } @@ -689,17 +706,18 @@ impl Accumulator for MinAccumulator { } #[derive(Debug)] -struct MinRowAccumulator { +struct MinAccumulatorV2 { index: usize, + data_type: DataType, } -impl MinRowAccumulator { - pub fn new(index: usize) -> Self { - Self { index } +impl MinAccumulatorV2 { + pub fn new(index: usize, data_type: DataType) -> Self { + Self { index, data_type } } } -impl RowAccumulator for MinRowAccumulator { +impl AccumulatorV2 for MinAccumulatorV2 { fn update_batch( &mut self, values: &[ArrayRef], @@ -718,6 +736,10 @@ impl RowAccumulator for MinRowAccumulator { ) -> Result<()> { self.update_batch(states, accessor) } + + fn evaluate(&self, 
accessor: &RowAccessor) -> Result { + Ok(accessor.get_as_scalar(&self.data_type, self.index)) + } } #[cfg(test)] diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index 411638a43ced2..327c40ad29c24 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::aggregate::accumulator_v2::AccumulatorV2; use crate::PhysicalExpr; use arrow::datatypes::Field; use datafusion_common::Result; @@ -22,7 +23,6 @@ use datafusion_expr::Accumulator; use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -use crate::aggregate::row_accumulator::RowAccumulator; pub(crate) mod approx_distinct; pub(crate) mod approx_median; @@ -37,9 +37,9 @@ pub(crate) mod covariance; pub(crate) mod distinct_expressions; #[macro_use] pub(crate) mod min_max; +pub mod accumulator_v2; pub mod build_in; mod hyperloglog; -pub mod row_accumulator; pub(crate) mod stats; pub(crate) mod stddev; pub(crate) mod sum; @@ -79,11 +79,14 @@ pub trait AggregateExpr: Send + Sync + Debug { } /// If the aggregate expression is supported by row format - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { false } - fn create_accumulator_v2(&self, _start_index: usize) -> Result> { + fn create_accumulator_v2( + &self, + _start_index: usize, + ) -> Result> { unreachable!() } } diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index 431a377871d81..e0ed2ffc72fd7 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -34,7 +34,7 @@ use arrow::{ use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::Accumulator; -use crate::aggregate::row_accumulator::RowAccumulator; +use crate::aggregate::accumulator_v2::AccumulatorV2; use 
crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; @@ -99,7 +99,7 @@ impl AggregateExpr for Sum { &self.name } - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -115,8 +115,14 @@ impl AggregateExpr for Sum { ) } - fn create_accumulator_v2(&self, start_index: usize) -> Result> { - Ok(Box::new(SumRowAccumulator::new(start_index, self.data_type.clone()))) + fn create_accumulator_v2( + &self, + start_index: usize, + ) -> Result> { + Ok(Box::new(SumAccumulatorV2::new( + start_index, + self.data_type.clone(), + ))) } } @@ -441,18 +447,18 @@ impl Accumulator for SumAccumulator { } #[derive(Debug)] -struct SumRowAccumulator { +struct SumAccumulatorV2 { index: usize, datatype: DataType, } -impl SumRowAccumulator { +impl SumAccumulatorV2 { pub fn new(index: usize, datatype: DataType) -> Self { Self { index, datatype } } } -impl RowAccumulator for SumRowAccumulator { +impl AccumulatorV2 for SumAccumulatorV2 { fn update_batch( &mut self, values: &[ArrayRef], @@ -470,6 +476,10 @@ impl RowAccumulator for SumRowAccumulator { ) -> Result<()> { self.update_batch(states, accessor) } + + fn evaluate(&self, accessor: &RowAccessor) -> Result { + Ok(accessor.get_as_scalar(&self.datatype, self.index)) + } } #[cfg(test)] diff --git a/datafusion/row/src/accessor.rs b/datafusion/row/src/accessor.rs index 7b204fe85b7c2..ae5c74b701352 100644 --- a/datafusion/row/src/accessor.rs +++ b/datafusion/row/src/accessor.rs @@ -18,10 +18,11 @@ //! Setter/Getter for row with all fixed-sized fields. 
use crate::layout::{RowLayout, RowType}; -use crate::validity::{all_valid, NullBitsFormatter}; -use crate::{fn_get_idx, fn_get_idx_opt, fn_set_idx, get_idx, set_idx}; -use arrow::datatypes::Schema; -use arrow::util::bit_util::{get_bit_raw, set_bit_raw, unset_bit_raw}; +use crate::validity::NullBitsFormatter; +use crate::{fn_get_idx, fn_get_idx_opt, fn_set_idx}; +use arrow::datatypes::{DataType, Schema}; +use arrow::util::bit_util::{get_bit_raw, set_bit_raw}; +use datafusion_common::ScalarValue; //TODO: DRY with reader and writer @@ -84,6 +85,20 @@ macro_rules! fn_max_min_idx { }; } +macro_rules! fn_get_idx_scalar { + ($NATIVE: ident, $SCALAR:ident) => { + paste::item! { + pub fn [](&self, idx: usize) -> ScalarValue { + if self.is_valid_at(idx) { + ScalarValue::$SCALAR(Some(self.[](idx))) + } else { + ScalarValue::$SCALAR(None) + } + } + } + }; +} + impl<'a> RowAccessor<'a> { /// new pub fn new(schema: &Schema, row_type: RowType) -> Self { @@ -94,6 +109,14 @@ impl<'a> RowAccessor<'a> { } } + pub fn new_from_layout(layout: RowLayout) -> Self { + Self { + layout, + data: &mut [], + base_offset: 0, + } + } + /// Update this row to point to position `offset` in `base` pub fn point_to(&mut self, offset: usize, data: &'a mut [u8]) { self.base_offset = offset; @@ -125,16 +148,6 @@ impl<'a> RowAccessor<'a> { } } - #[inline(always)] - fn all_valid(&self) -> bool { - if self.null_free() { - true - } else { - let null_bits = self.null_bits(); - all_valid(null_bits, self.layout.field_count) - } - } - fn is_valid_at(&self, idx: usize) -> bool { unsafe { get_bit_raw(self.null_bits().as_ptr(), idx) } } @@ -166,14 +179,6 @@ impl<'a> RowAccessor<'a> { fn_get_idx!(f32, 4); fn_get_idx!(f64, 8); - fn get_date32(&self, idx: usize) -> i32 { - get_idx!(i32, self, idx, 4) - } - - fn get_date64(&self, idx: usize) -> i64 { - get_idx!(i64, self, idx, 8) - } - fn_get_idx_opt!(bool); fn_get_idx_opt!(u8); fn_get_idx_opt!(u16); @@ -186,19 +191,32 @@ impl<'a> RowAccessor<'a> { 
fn_get_idx_opt!(f32); fn_get_idx_opt!(f64); - fn get_date32_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_date32(idx)) - } else { - None - } - } - - fn get_date64_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_date64(idx)) - } else { - None + fn_get_idx_scalar!(bool, Boolean); + fn_get_idx_scalar!(u8, UInt8); + fn_get_idx_scalar!(u16, UInt16); + fn_get_idx_scalar!(u32, UInt32); + fn_get_idx_scalar!(u64, UInt64); + fn_get_idx_scalar!(i8, Int8); + fn_get_idx_scalar!(i16, Int16); + fn_get_idx_scalar!(i32, Int32); + fn_get_idx_scalar!(i64, Int64); + fn_get_idx_scalar!(f32, Float32); + fn_get_idx_scalar!(f64, Float64); + + pub fn get_as_scalar(&self, dt: &DataType, index: usize) -> ScalarValue { + match dt { + DataType::Boolean => self.get_bool_scalar(index), + DataType::Int8 => self.get_i8_scalar(index), + DataType::Int16 => self.get_i16_scalar(index), + DataType::Int32 => self.get_i32_scalar(index), + DataType::Int64 => self.get_i64_scalar(index), + DataType::UInt8 => self.get_u8_scalar(index), + DataType::UInt16 => self.get_u16_scalar(index), + DataType::UInt32 => self.get_u32_scalar(index), + DataType::UInt64 => self.get_u64_scalar(index), + DataType::Float32 => self.get_f32_scalar(index), + DataType::Float64 => self.get_f64_scalar(index), + _ => unreachable!(), } } @@ -206,17 +224,6 @@ impl<'a> RowAccessor<'a> { // ----- Fixed Sized setters ---- // ------------------------------ - pub(crate) fn set_null_at(&mut self, idx: usize) { - assert!( - !self.null_free(), - "Unexpected call to set_null_at on null-free row writer" - ); - let null_bits = &mut self.data[0..self.layout.null_width]; - unsafe { - unset_bit_raw(null_bits.as_mut_ptr(), idx); - } - } - pub(crate) fn set_non_null_at(&mut self, idx: usize) { assert!( !self.null_free(), @@ -228,12 +235,6 @@ impl<'a> RowAccessor<'a> { } } - fn set_bool(&mut self, idx: usize, value: bool) { - self.assert_index_valid(idx); - let offset = 
self.field_offsets()[idx]; - self.data[offset] = if value { 1 } else { 0 }; - } - fn set_u8(&mut self, idx: usize, value: u8) { self.assert_index_valid(idx); let offset = self.field_offsets()[idx]; @@ -255,14 +256,6 @@ impl<'a> RowAccessor<'a> { self.data[offset] = value.to_le_bytes()[0]; } - fn set_date32(&mut self, idx: usize, value: i32) { - set_idx!(4, self, idx, value) - } - - fn set_date64(&mut self, idx: usize, value: i64) { - set_idx!(8, self, idx, value) - } - // ------------------------------ // ---- Fixed sized updaters ---- // ------------------------------ diff --git a/datafusion/row/src/layout.rs b/datafusion/row/src/layout.rs index 2c4c15da5a09e..adbe67ea52df9 100644 --- a/datafusion/row/src/layout.rs +++ b/datafusion/row/src/layout.rs @@ -38,7 +38,7 @@ pub enum RowType { } /// Reveals how the fields of a record are stored in the raw-bytes format -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct RowLayout { /// Type of the layout row_type: RowType, diff --git a/datafusion/row/src/lib.rs b/datafusion/row/src/lib.rs index f954b16bc36cd..c31bf751a1190 100644 --- a/datafusion/row/src/lib.rs +++ b/datafusion/row/src/lib.rs @@ -47,7 +47,7 @@ //! 0 1 2 10 14 22 31 32 //! 
-use arrow::array::{make_builder, ArrayBuilder}; +use arrow::array::{make_builder, ArrayBuilder, ArrayRef}; use arrow::datatypes::Schema; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; @@ -85,6 +85,10 @@ impl MutableRecordBatch { let result = make_batch(self.schema.clone(), self.arrays.drain(..).collect()); result } + + pub fn output_as_columns(&mut self) -> Vec { + get_columns(self.arrays.drain(..).collect()) + } } fn new_arrays(schema: &Schema, batch_size: usize) -> Vec> { @@ -106,6 +110,10 @@ fn make_batch( RecordBatch::try_new(schema, columns) } +fn get_columns(mut arrays: Vec>) -> Vec { + arrays.iter_mut().map(|array| array.finish()).collect() +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/row/src/reader.rs b/datafusion/row/src/reader.rs index 77e9a552cbf84..1bf6e102a9f2c 100644 --- a/datafusion/row/src/reader.rs +++ b/datafusion/row/src/reader.rs @@ -76,7 +76,7 @@ macro_rules! fn_get_idx { macro_rules! fn_get_idx_opt { ($NATIVE: ident) => { paste::item! 
{ - fn [](&self, idx: usize) -> Option<$NATIVE> { + pub fn [](&self, idx: usize) -> Option<$NATIVE> { if self.is_valid_at(idx) { Some(self.[](idx)) } else { From 430c31577a1790df6512ebe8b9d01ef7696b224d Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Thu, 5 May 2022 14:45:59 +0800 Subject: [PATCH 5/8] aggregate with row state --- .../core/src/physical_plan/aggregates/mod.rs | 24 +++- .../src/physical_plan/aggregates/row_hash.rs | 114 +++++++----------- .../core/src/physical_plan/hash_utils.rs | 6 +- .../src/aggregate/accumulator_v2.rs | 2 +- .../physical-expr/src/aggregate/average.rs | 6 +- datafusion/row/src/layout.rs | 9 +- datafusion/row/src/lib.rs | 7 +- 7 files changed, 88 insertions(+), 80 deletions(-) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index abb68d8010367..8e6f0c4c1b44d 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -43,9 +43,11 @@ mod hash; mod no_grouping; mod row_hash; +use crate::physical_plan::aggregates::row_hash::GroupedHashAggregateStreamV2; pub use datafusion_expr::AggregateFunction; use datafusion_physical_expr::aggregate::accumulator_v2::AccumulatorV2; pub use datafusion_physical_expr::expressions::create_aggregate_expr; +use datafusion_row::{row_supported, RowType}; /// Hash aggregate modes #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -145,6 +147,12 @@ impl AggregateExec { pub fn input_schema(&self) -> SchemaRef { self.input_schema.clone() } + + fn row_aggregate_supported(&self) -> bool { + let group_schema = group_schema(&self.schema, self.group_expr.len()); + row_supported(&group_schema, RowType::Compact) + && accumulator_v2_supported(&self.aggr_expr) + } } impl ExecutionPlan for AggregateExec { @@ -215,6 +223,15 @@ impl ExecutionPlan for AggregateExec { input, baseline_metrics, )?)) + } else if self.row_aggregate_supported() { + Ok(Box::pin(GroupedHashAggregateStreamV2::new( + 
self.mode, + self.schema.clone(), + group_expr, + self.aggr_expr.clone(), + input, + baseline_metrics, + )?)) } else { Ok(Box::pin(GroupedHashAggregateStream::new( self.mode, @@ -318,6 +335,11 @@ fn create_schema( Ok(Schema::new(fields)) } +fn group_schema(schema: &Schema, group_count: usize) -> SchemaRef { + let group_fields = schema.fields()[0..group_count].to_vec(); + Arc::new(Schema::new(group_fields)) +} + /// returns physical expressions to evaluate against a batch /// The expressions are different depending on `mode`: /// * Partial: AggregateExpr::expressions @@ -376,7 +398,7 @@ fn create_accumulators( .collect::>>() } -fn check_accumulator_v2_supported(aggr_expr: &[Arc]) -> bool { +fn accumulator_v2_supported(aggr_expr: &[Arc]) -> bool { aggr_expr.iter().all(|expr| expr.accumulator_v2_supported()) } diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index 2da8e39d42d8a..0ee46c880914b 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -29,7 +29,7 @@ use futures::{ use crate::error::Result; use crate::physical_plan::aggregates::{ - evaluate, evaluate_many, AccumulatorItemV2, AggregateMode, + evaluate, evaluate_many, group_schema, AccumulatorItemV2, AggregateMode, }; use crate::physical_plan::hash_utils::create_row_hashes; use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; @@ -52,60 +52,38 @@ use datafusion_row::writer::{write_row, RowWriter}; use datafusion_row::{MutableRecordBatch, RowType}; use hashbrown::raw::RawTable; -/* -The architecture is the following: - -1. An accumulator has state that is updated on each batch. -2. At the end of the aggregation (e.g. end of batches in a partition), the accumulator converts its state to a RecordBatch of a single row -3. The RecordBatches of all accumulators are merged (`concatenate` in `rust/arrow`) together to a single RecordBatch. -4. 
The state's RecordBatch is `merge`d to a new state -5. The state is mapped to the final value - -Why: - -* Accumulators' state can be statically typed, but it is more efficient to transmit data from the accumulators via `Array` -* The `merge` operation must have access to the state of the aggregators because it uses it to correctly merge -* It uses Arrow's native dynamically typed object, `Array`. -* Arrow shines in batch operations and both `merge` and `concatenate` of uniform types are very performant. - -Example: average - -* the state is `n: u32` and `sum: f64` -* For every batch, we update them accordingly. -* At the end of the accumulation (of a partition), we convert `n` and `sum` to a RecordBatch of 1 row and two columns: `[n, sum]` -* The RecordBatch is (sent back / transmitted over network) -* Once all N record batches arrive, `merge` is performed, which builds a RecordBatch with N rows and 2 columns. -* Finally, `get_value` returns an array with one entry computed from the state -*/ -pub(crate) struct GroupedRowHashAggregateStream { +/// Grouping aggregate with row format to store the aggregation state. +/// +/// The Architecture is similar to that in [`super::GroupedHashAggregateStream`] but use +/// row format inside the HashTable to store aggregation buffers. 
+pub(crate) struct GroupedHashAggregateStreamV2 { schema: SchemaRef, input: SendableRecordBatchStream, mode: AggregateMode, - accumulators: Accumulators, + aggr_state: AggregationState, aggregate_expressions: Vec>>, group_expr: Vec>, - accs_v2: Vec, + accumulators: Vec, group_schema: SchemaRef, aggr_schema: SchemaRef, aggr_layout: RowLayout, - aggr_buffer_width: usize, baseline_metrics: BaselineMetrics, random_state: RandomState, finished: bool, } -fn create_separate_schema(schema: &Schema, group_count: usize) -> (SchemaRef, SchemaRef) { - let (group_fields, aggr_fields) = schema.fields().split_at(group_count); - ( - Arc::new(Schema::new(group_fields.to_vec())), - Arc::new(Schema::new(aggr_fields.to_vec())), - ) +fn aggr_state_schema(aggr_expr: &[Arc]) -> Result { + let fields = aggr_expr + .iter() + .flat_map(|expr| expr.state_fields().unwrap().into_iter()) + .collect::>(); + Ok(Arc::new(Schema::new(fields))) } -impl GroupedRowHashAggregateStream { +impl GroupedHashAggregateStreamV2 { /// Create a new GroupedRowHashAggregateStream pub fn new( mode: AggregateMode, @@ -123,12 +101,12 @@ impl GroupedRowHashAggregateStream { let aggregate_expressions = aggregates::aggregate_expressions(&aggr_expr, &mode, group_expr.len())?; - let accs_v2 = aggregates::create_accumulators_v2(&aggr_expr)?; + let accumulators = aggregates::create_accumulators_v2(&aggr_expr)?; + + let group_schema = group_schema(&schema, group_expr.len()); + let aggr_schema = aggr_state_schema(&aggr_expr)?; - let (group_schema, aggr_schema) = - create_separate_schema(&schema, group_expr.len()); let aggr_layout = RowLayout::new(&aggr_schema, RowType::WordAligned); - let aggr_buffer_width = aggr_layout.fixed_part_width(); timer.done(); Ok(Self { @@ -136,21 +114,20 @@ impl GroupedRowHashAggregateStream { mode, input, group_expr, - accs_v2, + accumulators, group_schema, aggr_schema, aggr_layout, - aggr_buffer_width, baseline_metrics, aggregate_expressions, - accumulators: Default::default(), + aggr_state: 
Default::default(), random_state: Default::default(), finished: false, }) } } -impl Stream for GroupedRowHashAggregateStream { +impl Stream for GroupedHashAggregateStreamV2 { type Item = ArrowResult; fn poll_next( @@ -172,12 +149,11 @@ impl Stream for GroupedRowHashAggregateStream { &this.mode, &this.random_state, &this.group_expr, - &mut this.accs_v2, + &mut this.accumulators, &this.group_schema, &this.aggr_layout, - this.aggr_buffer_width, batch, - &mut this.accumulators, + &mut this.aggr_state, &this.aggregate_expressions, ); @@ -196,8 +172,8 @@ impl Stream for GroupedRowHashAggregateStream { &this.mode, &this.group_schema, &this.aggr_schema, + &mut this.aggr_state, &mut this.accumulators, - &mut this.accs_v2, &this.schema, ) .record_output(&this.baseline_metrics); @@ -213,23 +189,23 @@ impl Stream for GroupedRowHashAggregateStream { } } -impl RecordBatchStream for GroupedRowHashAggregateStream { +impl RecordBatchStream for GroupedHashAggregateStreamV2 { fn schema(&self) -> SchemaRef { self.schema.clone() } } -/// TODO: Make this a member function of [`GroupedRowHashAggregateStream`] +/// TODO: Make this a member function of [`GroupedHashAggregateStreamV2`] +#[allow(clippy::too_many_arguments)] fn group_aggregate_batch( mode: &AggregateMode, random_state: &RandomState, group_expr: &[Arc], - accs_v2: &mut [AccumulatorItemV2], + accumulators: &mut [AccumulatorItemV2], group_schema: &Schema, state_layout: &RowLayout, - aggr_buffer_width: usize, batch: RecordBatch, - accumulators: &mut Accumulators, + aggr_state: &mut AggregationState, aggregate_expressions: &[Vec>], ) -> Result<()> { // evaluate the grouping expressions @@ -245,7 +221,7 @@ fn group_aggregate_batch( // 1.2 construct the mapping key if it does not exist // 1.3 add the row' index to `indices` - // track which entries in `accumulators` have rows in this batch to aggregate + // track which entries in `aggr_state` have rows in this batch to aggregate let mut groups_with_rows = vec![]; // 1.1 Calculate 
the group keys for the group values @@ -253,7 +229,7 @@ fn group_aggregate_batch( create_row_hashes(&group_rows, random_state, &mut batch_hashes)?; for (row, hash) in batch_hashes.into_iter().enumerate() { - let Accumulators { map, group_states } = accumulators; + let AggregationState { map, group_states } = aggr_state; let entry = map.get_mut(hash, |(_hash, group_idx)| { // verify that a group that we are inserting with hash is @@ -278,7 +254,7 @@ fn group_aggregate_batch( // Add new entry to group_states and save newly created index let group_state = RowGroupState { group_by_values: group_rows[row].clone(), - aggregation_buffer: Vec::with_capacity(aggr_buffer_width), + aggregation_buffer: vec![0; state_layout.fixed_part_width()], indices: vec![row as u32], // 1.3 }; let group_idx = group_states.len(); @@ -296,7 +272,7 @@ fn group_aggregate_batch( let mut offsets = vec![0]; let mut offset_so_far = 0; for group_idx in groups_with_rows.iter() { - let indices = &accumulators.group_states[*group_idx].indices; + let indices = &aggr_state.group_states[*group_idx].indices; batch_indices.append_slice(indices)?; offset_so_far += indices.len(); offsets.push(offset_so_far); @@ -331,9 +307,9 @@ fn group_aggregate_batch( .iter() .zip(offsets.windows(2)) .try_for_each(|(group_idx, offsets)| { - let group_state = &mut accumulators.group_states[*group_idx]; + let group_state = &mut aggr_state.group_states[*group_idx]; // 2.2 - accs_v2 + accumulators .iter_mut() .zip(values.iter()) .map(|(accumulator, aggr_array)| { @@ -389,7 +365,7 @@ struct RowGroupState { /// The state of all the groups #[derive(Default)] -struct Accumulators { +struct AggregationState { /// Logically maps group values to an index in `group_states` /// /// Uses the raw API of hashbrown to avoid actually storing the @@ -403,7 +379,7 @@ struct Accumulators { group_states: Vec, } -impl std::fmt::Debug for Accumulators { +impl std::fmt::Debug for AggregationState { fn fmt(&self, f: &mut std::fmt::Formatter) -> 
std::fmt::Result { // hashes are not store inline, so could only get values let map_string = "RawTable"; @@ -431,17 +407,17 @@ fn create_batch_from_map( mode: &AggregateMode, group_schema: &Schema, aggr_schema: &Schema, - accumulators: &mut Accumulators, - accs_v2: &mut [AccumulatorItemV2], + aggr_state: &mut AggregationState, + accumulators: &mut [AccumulatorItemV2], output_schema: &Schema, ) -> ArrowResult { - if accumulators.group_states.is_empty() { + if aggr_state.group_states.is_empty() { return Ok(RecordBatch::new_empty(Arc::new(output_schema.to_owned()))); } let mut state_accessor = RowAccessor::new(aggr_schema, RowType::WordAligned); - let (group_buffers, mut state_buffers): (Vec<_>, Vec<_>) = accumulators + let (group_buffers, mut state_buffers): (Vec<_>, Vec<_>) = aggr_state .group_states .iter() .map(|gs| (gs.group_by_values.clone(), gs.aggregation_buffer.clone())) @@ -457,10 +433,10 @@ fn create_batch_from_map( RowType::WordAligned, )), AggregateMode::Final | AggregateMode::FinalPartitioned => { - let mut results: Vec> = vec![vec![]; accs_v2.len()]; + let mut results: Vec> = vec![vec![]; accumulators.len()]; for buffer in state_buffers.iter_mut() { state_accessor.point_to(0, buffer); - for (i, acc) in accs_v2.iter().enumerate() { + for (i, acc) in accumulators.iter().enumerate() { results[i].push(acc.evaluate(&state_accessor).unwrap()); } } @@ -485,7 +461,7 @@ fn create_batch_from_map( fn read_as_batch(rows: &[Vec], schema: &Schema, row_type: RowType) -> Vec { let row_num = rows.len(); let mut output = MutableRecordBatch::new(row_num, Arc::new(schema.clone())); - let mut row = RowReader::new(&schema, row_type); + let mut row = RowReader::new(schema, row_type); for data in rows { row.point_to(0, data); diff --git a/datafusion/core/src/physical_plan/hash_utils.rs b/datafusion/core/src/physical_plan/hash_utils.rs index 3c0207a863cf7..65099a79e0913 100644 --- a/datafusion/core/src/physical_plan/hash_utils.rs +++ 
b/datafusion/core/src/physical_plan/hash_utils.rs @@ -265,7 +265,7 @@ pub fn create_hashes<'a>( for hash in hashes_buffer.iter_mut() { *hash = 0 } - return Ok(hashes_buffer); + Ok(hashes_buffer) } /// Test version of `create_row_hashes` that produces the same value for @@ -281,7 +281,7 @@ pub fn create_row_hashes<'a>( for hash in hashes_buffer.iter_mut() { *hash = 0 } - return Ok(hashes_buffer); + Ok(hashes_buffer) } /// Test version of `create_row_hashes` that produces the same value for @@ -300,7 +300,7 @@ pub fn create_row_hashes<'a>( for (i, hash) in hashes_buffer.iter_mut().enumerate() { *hash = >::get_hash(&rows[i], random_state); } - return Ok(hashes_buffer); + Ok(hashes_buffer) } /// Creates hash values for every row, based on the values in the diff --git a/datafusion/physical-expr/src/aggregate/accumulator_v2.rs b/datafusion/physical-expr/src/aggregate/accumulator_v2.rs index dc8345064dcff..d2ebf12f710c8 100644 --- a/datafusion/physical-expr/src/aggregate/accumulator_v2.rs +++ b/datafusion/physical-expr/src/aggregate/accumulator_v2.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Accumulator in raw format +//! 
Accumulator over row format use arrow::array::ArrayRef; use datafusion_common::{Result, ScalarValue}; diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 2847502e77caf..42960c9a76b78 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -217,9 +217,11 @@ impl AccumulatorV2 for AvgAccumulatorV2 { accessor: &mut RowAccessor, ) -> Result<()> { let values = &values[0]; - + // count let delta = (values.len() - values.data().null_count()) as u64; accessor.add_u64(self.start_index, delta); + + // sum sum::add_to_row( &self.sum_datatype, self.start_index + 1, @@ -235,9 +237,11 @@ impl AccumulatorV2 for AvgAccumulatorV2 { accessor: &mut RowAccessor, ) -> Result<()> { let counts = states[0].as_any().downcast_ref::().unwrap(); + // count let delta = compute::sum(counts).unwrap_or(0); accessor.add_u64(self.start_index, delta); + // sum sum::add_to_row( &self.sum_datatype, self.start_index + 1, diff --git a/datafusion/row/src/layout.rs b/datafusion/row/src/layout.rs index adbe67ea52df9..0c92025a74f4c 100644 --- a/datafusion/row/src/layout.rs +++ b/datafusion/row/src/layout.rs @@ -57,7 +57,12 @@ pub struct RowLayout { impl RowLayout { /// new pub fn new(schema: &Schema, row_type: RowType) -> Self { - assert!(row_supported(schema, row_type)); + assert!( + row_supported(schema, row_type), + "{:?}Row with {:?} not supported yet.", + row_type, + schema, + ); let null_free = schema_null_free(schema); let field_count = schema.fields().len(); let null_width = if null_free { @@ -151,7 +156,7 @@ pub(crate) fn estimate_row_width(schema: &Schema, layout: &RowLayout) -> usize { /// Tell if we can create raw-bytes based rows since we currently /// has limited data type supports in the row format -fn row_supported(schema: &Schema, row_type: RowType) -> bool { +pub fn row_supported(schema: &Schema, row_type: RowType) -> bool { schema .fields() .iter() diff 
--git a/datafusion/row/src/lib.rs b/datafusion/row/src/lib.rs index c31bf751a1190..c05cbcd0ef1c4 100644 --- a/datafusion/row/src/lib.rs +++ b/datafusion/row/src/lib.rs @@ -51,6 +51,7 @@ use arrow::array::{make_builder, ArrayBuilder, ArrayRef}; use arrow::datatypes::Schema; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; +pub use layout::row_supported; pub use layout::RowType; use std::sync::Arc; @@ -350,7 +351,7 @@ mod tests { ); #[test] - #[should_panic(expected = "row_supported(schema, row_type)")] + #[should_panic(expected = "not supported yet")] fn test_unsupported_word_aligned_type() { let a: ArrayRef = Arc::new(StringArray::from(vec!["hello", "world"])); let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap(); @@ -389,7 +390,7 @@ mod tests { } #[test] - #[should_panic(expected = "row_supported(schema, row_type)")] + #[should_panic(expected = "not supported yet")] fn test_unsupported_type_write() { let a: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap(); @@ -399,7 +400,7 @@ mod tests { } #[test] - #[should_panic(expected = "row_supported(schema, row_type)")] + #[should_panic(expected = "not supported yet")] fn test_unsupported_type_read() { let schema = Arc::new(Schema::new(vec![Field::new( "a", From 1cf0ba56425e00dac5e1cb81cf1d50c783ce2c72 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Thu, 5 May 2022 19:18:23 +0800 Subject: [PATCH 6/8] make row non-optional --- datafusion/core/Cargo.toml | 9 +++------ datafusion/core/benches/aggregate_query_sql.rs | 10 ++++++++++ datafusion/core/src/lib.rs | 1 - datafusion/physical-expr/Cargo.toml | 5 ++--- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 510c385733706..e11e02e95bdf1 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -41,15 +41,13 @@ path = "src/lib.rs" # Used to enable the avro 
format avro = ["avro-rs", "num-traits", "datafusion-common/avro"] crypto_expressions = ["datafusion-physical-expr/crypto_expressions"] -default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "row"] +default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) force_hash_collisions = [] # Used to enable JIT code generation jit = ["datafusion-jit"] pyarrow = ["pyo3", "arrow/pyarrow", "datafusion-common/pyarrow"] regex_expressions = ["datafusion-physical-expr/regex_expressions"] -# Used to enable row format experiment -row = ["datafusion-row"] # Used to enable scheduler scheduler = ["rayon"] simd = ["arrow/simd"] @@ -66,7 +64,7 @@ datafusion-data-access = { path = "../../data-access", version = "1.0.0" } datafusion-expr = { path = "../expr", version = "7.0.0" } datafusion-jit = { path = "../jit", version = "7.0.0", optional = true } datafusion-physical-expr = { path = "../physical-expr", version = "7.0.0" } -datafusion-row = { path = "../row", version = "7.0.0", optional = true } +datafusion-row = { path = "../row", version = "7.0.0" } futures = "0.3" hashbrown = { version = "0.12", features = ["raw"] } lazy_static = { version = "^1.4.0" } @@ -134,8 +132,7 @@ name = "sql_planner" [[bench]] harness = false name = "jit" -required-features = ["row", "jit"] +required-features = ["jit"] [[test]] name = "row" -required-features = ["row"] diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs index 807e64ff5e273..8570f81700c50 100644 --- a/datafusion/core/benches/aggregate_query_sql.rs +++ b/datafusion/core/benches/aggregate_query_sql.rs @@ -133,6 +133,16 @@ fn criterion_benchmark(c: &mut Criterion) { ) }) }); + + c.bench_function("aggregate_query_group_by_u64_multiple_keys", |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT u64_wide, utf8, MIN(f64), AVG(f64), COUNT(f64) \ + FROM t GROUP BY 
u64_wide, utf8", + ) + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index c598d9a33cefb..b553c0ed84b53 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -233,7 +233,6 @@ pub use datafusion_data_access; pub use datafusion_expr as logical_expr; pub use datafusion_physical_expr as physical_expr; -#[cfg(feature = "row")] pub use datafusion_row as row; #[cfg(feature = "jit")] diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index ba4d11c0be6a4..d64ecb07b7142 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -34,9 +34,8 @@ path = "src/lib.rs" [features] crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] -default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "row"] +default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] regex_expressions = ["regex"] -row = ["datafusion-row"] unicode_expressions = ["unicode-segmentation"] [dependencies] @@ -47,7 +46,7 @@ blake3 = { version = "1.0", optional = true } chrono = { version = "0.4", default-features = false } datafusion-common = { path = "../common", version = "7.0.0" } datafusion-expr = { path = "../expr", version = "7.0.0" } -datafusion-row = { path = "../row", version = "7.0.0", optional = true } +datafusion-row = { path = "../row", version = "7.0.0" } hashbrown = { version = "0.12", features = ["raw"] } lazy_static = { version = "^1.4.0" } md-5 = { version = "^0.10.0", optional = true } From c8b4833a2cb83b5b70c78d9082acfbc60eb3213f Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 6 May 2022 19:12:51 +0800 Subject: [PATCH 7/8] address comments, add docs, part fix #2455 --- .../core/src/physical_plan/aggregates/mod.rs | 16 ++++-- .../src/physical_plan/aggregates/row_hash.rs | 26 +++++++--- .../core/src/physical_plan/hash_utils.rs | 5 +- datafusion/core/tests/sql/aggregates.rs | 22 
++++++++ datafusion/core/tests/sql/functions.rs | 4 +- .../physical-expr/src/aggregate/average.rs | 51 +++++++++++-------- .../physical-expr/src/aggregate/count.rs | 31 ++++++----- .../physical-expr/src/aggregate/min_max.rs | 40 +++++++++------ datafusion/physical-expr/src/aggregate/mod.rs | 15 ++++-- .../{accumulator_v2.rs => row_accumulator.rs} | 25 ++++++++- datafusion/physical-expr/src/aggregate/sum.rs | 40 ++++++++++----- datafusion/row/src/accessor.rs | 7 +-- 12 files changed, 193 insertions(+), 89 deletions(-) rename datafusion/physical-expr/src/aggregate/{accumulator_v2.rs => row_accumulator.rs} (54%) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index 8e6f0c4c1b44d..abe20cdcbc94e 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -45,7 +45,7 @@ mod row_hash; use crate::physical_plan::aggregates::row_hash::GroupedHashAggregateStreamV2; pub use datafusion_expr::AggregateFunction; -use datafusion_physical_expr::aggregate::accumulator_v2::AccumulatorV2; +use datafusion_physical_expr::aggregate::row_accumulator::RowAccumulator; pub use datafusion_physical_expr::expressions::create_aggregate_expr; use datafusion_row::{row_supported, RowType}; @@ -387,7 +387,7 @@ fn merge_expressions( } pub(crate) type AccumulatorItem = Box; -pub(crate) type AccumulatorItemV2 = Box; +pub(crate) type AccumulatorItemV2 = Box; fn create_accumulators( aggr_expr: &[Arc], @@ -399,16 +399,22 @@ fn create_accumulators( } fn accumulator_v2_supported(aggr_expr: &[Arc]) -> bool { - aggr_expr.iter().all(|expr| expr.accumulator_v2_supported()) + aggr_expr + .iter() + .all(|expr| expr.row_accumulator_supported()) } fn create_accumulators_v2( aggr_expr: &[Arc], ) -> datafusion_common::Result> { + let mut state_index = 0; aggr_expr .iter() - .enumerate() - .map(|(idx, expr)| expr.create_accumulator_v2(idx)) + .map(|expr| { + let result = 
expr.create_row_accumulator(state_index); + state_index += expr.state_fields().unwrap().len(); + result + }) .collect::>>() } diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index 0ee46c880914b..eac1590dbb110 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -52,10 +52,22 @@ use datafusion_row::writer::{write_row, RowWriter}; use datafusion_row::{MutableRecordBatch, RowType}; use hashbrown::raw::RawTable; -/// Grouping aggregate with row format to store the aggregation state. +/// Grouping aggregate with row-format aggregation states inside. /// -/// The Architecture is similar to that in [`super::GroupedHashAggregateStream`] but use -/// row format inside the HashTable to store aggregation buffers. +/// For each aggregation entry, we use: +/// - [Compact] row represents grouping keys for fast hash computation and comparison directly on raw bytes. +/// - [WordAligned] row to store aggregation state, designed to be CPU-friendly when updates over every field are often. +/// +/// The architecture is the following: +/// +/// 1. For each input RecordBatch, update aggregation states corresponding to all appeared grouping keys. +/// 2. At the end of the aggregation (e.g. end of batches in a partition), the accumulator converts its state to a RecordBatch of a single row +/// 3. The RecordBatches of all accumulators are merged (`concatenate` in `rust/arrow`) together to a single RecordBatch. +/// 4. The state's RecordBatch is `merge`d to a new state +/// 5. 
The state is mapped to the final value +/// +/// [Compact]: datafusion_row::layout::RowType::Compact +/// [WordAligned]: datafusion_row::layout::RowType::WordAligned pub(crate) struct GroupedHashAggregateStreamV2 { schema: SchemaRef, input: SendableRecordBatchStream, @@ -68,7 +80,7 @@ pub(crate) struct GroupedHashAggregateStreamV2 { group_schema: SchemaRef, aggr_schema: SchemaRef, - aggr_layout: RowLayout, + aggr_layout: Arc, baseline_metrics: BaselineMetrics, random_state: RandomState, @@ -106,7 +118,7 @@ impl GroupedHashAggregateStreamV2 { let group_schema = group_schema(&schema, group_expr.len()); let aggr_schema = aggr_state_schema(&aggr_expr)?; - let aggr_layout = RowLayout::new(&aggr_schema, RowType::WordAligned); + let aggr_layout = Arc::new(RowLayout::new(&aggr_schema, RowType::WordAligned)); timer.done(); Ok(Self { @@ -151,7 +163,7 @@ impl Stream for GroupedHashAggregateStreamV2 { &this.group_expr, &mut this.accumulators, &this.group_schema, - &this.aggr_layout, + this.aggr_layout.clone(), batch, &mut this.aggr_state, &this.aggregate_expressions, @@ -203,7 +215,7 @@ fn group_aggregate_batch( group_expr: &[Arc], accumulators: &mut [AccumulatorItemV2], group_schema: &Schema, - state_layout: &RowLayout, + state_layout: Arc, batch: RecordBatch, aggr_state: &mut AggregationState, aggregate_expressions: &[Vec>], diff --git a/datafusion/core/src/physical_plan/hash_utils.rs b/datafusion/core/src/physical_plan/hash_utils.rs index 65099a79e0913..e68623be93c59 100644 --- a/datafusion/core/src/physical_plan/hash_utils.rs +++ b/datafusion/core/src/physical_plan/hash_utils.rs @@ -284,10 +284,7 @@ pub fn create_row_hashes<'a>( Ok(hashes_buffer) } -/// Test version of `create_row_hashes` that produces the same value for -/// all hashes (to test collisions) -/// -/// See comments on `hashes_buffer` for more details +/// Creates hash values for every row, based on their raw bytes. 
#[cfg(not(feature = "force_hash_collisions"))] pub fn create_row_hashes<'a>( rows: &[Vec], diff --git a/datafusion/core/tests/sql/aggregates.rs b/datafusion/core/tests/sql/aggregates.rs index b488e880dcf83..d8ec9e0167e60 100644 --- a/datafusion/core/tests/sql/aggregates.rs +++ b/datafusion/core/tests/sql/aggregates.rs @@ -652,6 +652,28 @@ async fn csv_query_array_agg_one() -> Result<()> { Ok(()) } +#[tokio::test] +async fn csv_query_array_agg_with_overflow() -> Result<()> { + let ctx = SessionContext::new(); + register_aggregate_csv(&ctx).await?; + let sql = + "select c2, sum(c3) sum_c3, avg(c3) avg_c3, max(c3) max_c3, min(c3) min_c3, count(c3) count_c3 from aggregate_test_100 group by c2 order by c2"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+----+--------+---------------------+--------+--------+----------+", + "| c2 | sum_c3 | avg_c3 | max_c3 | min_c3 | count_c3 |", + "+----+--------+---------------------+--------+--------+----------+", + "| 1 | 367 | 16.681818181818183 | 125 | -99 | 22 |", + "| 2 | 184 | 8.363636363636363 | 122 | -117 | 22 |", + "| 3 | 395 | 20.789473684210527 | 123 | -101 | 19 |", + "| 4 | 29 | 1.2608695652173914 | 123 | -117 | 23 |", + "| 5 | -194 | -13.857142857142858 | 118 | -101 | 14 |", + "+----+--------+---------------------+--------+--------+----------+", + ]; + assert_batches_eq!(expected, &actual); + Ok(()) +} + #[tokio::test] async fn csv_query_array_agg_distinct() -> Result<()> { let ctx = SessionContext::new(); diff --git a/datafusion/core/tests/sql/functions.rs b/datafusion/core/tests/sql/functions.rs index 857781aa35a3c..59236c467fd64 100644 --- a/datafusion/core/tests/sql/functions.rs +++ b/datafusion/core/tests/sql/functions.rs @@ -17,7 +17,6 @@ use super::*; -/// sqrt(f32) is slightly different than sqrt(CAST(f32 AS double))) #[tokio::test] async fn sqrt_f32_vs_f64() -> Result<()> { let ctx = create_ctx()?; @@ -25,7 +24,8 @@ async fn sqrt_f32_vs_f64() -> Result<()> { // sqrt(f32)'s plan 
passes let sql = "SELECT avg(sqrt(c11)) FROM aggregate_test_100"; let actual = execute(&ctx, sql).await; - let expected = vec![vec!["0.6584407806396484"]]; + let sql = "SELECT avg(CAST(sqrt(c11) AS double)) FROM aggregate_test_100"; + let expected = execute(&ctx, sql).await; assert_eq!(actual, expected); let sql = "SELECT avg(sqrt(CAST(c11 AS double))) FROM aggregate_test_100"; diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 42960c9a76b78..3eee84bb5f508 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; -use crate::aggregate::accumulator_v2::AccumulatorV2; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::aggregate::sum; use crate::expressions::format_state_name; use crate::{AggregateExpr, PhysicalExpr}; @@ -104,7 +104,7 @@ impl AggregateExpr for Avg { &self.name } - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -120,11 +120,11 @@ impl AggregateExpr for Avg { ) } - fn create_accumulator_v2( + fn create_row_accumulator( &self, start_index: usize, - ) -> Result> { - Ok(Box::new(AvgAccumulatorV2::new( + ) -> Result> { + Ok(Box::new(AvgRowAccumulator::new( start_index, self.data_type.clone(), ))) @@ -158,7 +158,10 @@ impl Accumulator for AvgAccumulator { let values = &values[0]; self.count += (values.len() - values.data().null_count()) as u64; - self.sum = sum::sum(&self.sum, &sum::sum_batch(values)?)?; + self.sum = sum::sum( + &self.sum, + &sum::sum_batch(values, &self.sum.get_datatype())?, + )?; Ok(()) } @@ -168,7 +171,10 @@ impl Accumulator for AvgAccumulator { self.count += compute::sum(counts).unwrap_or(0); // sums are summed - self.sum = sum::sum(&self.sum, &sum::sum_batch(&states[1])?)?; + self.sum = sum::sum( + &self.sum, + 
&sum::sum_batch(&states[1], &self.sum.get_datatype())?, + )?; Ok(()) } @@ -196,21 +202,21 @@ impl Accumulator for AvgAccumulator { } #[derive(Debug)] -struct AvgAccumulatorV2 { - start_index: usize, +struct AvgRowAccumulator { + state_index: usize, sum_datatype: DataType, } -impl AvgAccumulatorV2 { +impl AvgRowAccumulator { pub fn new(start_index: usize, sum_datatype: DataType) -> Self { Self { - start_index, + state_index: start_index, sum_datatype, } } } -impl AccumulatorV2 for AvgAccumulatorV2 { +impl RowAccumulator for AvgRowAccumulator { fn update_batch( &mut self, values: &[ArrayRef], @@ -219,14 +225,14 @@ impl AccumulatorV2 for AvgAccumulatorV2 { let values = &values[0]; // count let delta = (values.len() - values.data().null_count()) as u64; - accessor.add_u64(self.start_index, delta); + accessor.add_u64(self.state_index(), delta); // sum sum::add_to_row( &self.sum_datatype, - self.start_index + 1, + self.state_index() + 1, accessor, - &sum::sum_batch(values)?, + &sum::sum_batch(values, &self.sum_datatype)?, )?; Ok(()) } @@ -239,30 +245,35 @@ impl AccumulatorV2 for AvgAccumulatorV2 { let counts = states[0].as_any().downcast_ref::().unwrap(); // count let delta = compute::sum(counts).unwrap_or(0); - accessor.add_u64(self.start_index, delta); + accessor.add_u64(self.state_index(), delta); // sum sum::add_to_row( &self.sum_datatype, - self.start_index + 1, + self.state_index() + 1, accessor, - &sum::sum_batch(&states[1])?, + &sum::sum_batch(&states[1], &self.sum_datatype)?, )?; Ok(()) } fn evaluate(&self, accessor: &RowAccessor) -> Result { assert_eq!(self.sum_datatype, DataType::Float64); - Ok(match accessor.get_u64_opt(self.start_index) { + Ok(match accessor.get_u64_opt(self.state_index()) { None => ScalarValue::Float64(None), Some(0) => ScalarValue::Float64(Some(0.0)), Some(n) => ScalarValue::Float64( accessor - .get_f64_opt(self.start_index + 1) + .get_f64_opt(self.state_index() + 1) .map(|f| f / n as f64), ), }) } + + #[inline(always)] + fn 
state_index(&self) -> usize { + self.state_index + } } #[cfg(test)] diff --git a/datafusion/physical-expr/src/aggregate/count.rs b/datafusion/physical-expr/src/aggregate/count.rs index 9ccd13d5753ed..54bec05d72f0a 100644 --- a/datafusion/physical-expr/src/aggregate/count.rs +++ b/datafusion/physical-expr/src/aggregate/count.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -use crate::aggregate::accumulator_v2::AccumulatorV2; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::{AggregateExpr, PhysicalExpr}; use arrow::compute; use arrow::datatypes::DataType; @@ -96,15 +96,15 @@ impl AggregateExpr for Count { &self.name } - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { true } - fn create_accumulator_v2( + fn create_row_accumulator( &self, start_index: usize, - ) -> Result> { - Ok(Box::new(CountAccumulatorV2::new(start_index))) + ) -> Result> { + Ok(Box::new(CountRowAccumulator::new(start_index))) } } @@ -146,17 +146,17 @@ impl Accumulator for CountAccumulator { } #[derive(Debug)] -struct CountAccumulatorV2 { - index: usize, +struct CountRowAccumulator { + state_index: usize, } -impl CountAccumulatorV2 { +impl CountRowAccumulator { pub fn new(index: usize) -> Self { - Self { index } + Self { state_index: index } } } -impl AccumulatorV2 for CountAccumulatorV2 { +impl RowAccumulator for CountRowAccumulator { fn update_batch( &mut self, values: &[ArrayRef], @@ -164,7 +164,7 @@ impl AccumulatorV2 for CountAccumulatorV2 { ) -> Result<()> { let array = &values[0]; let delta = (array.len() - array.data().null_count()) as u64; - accessor.add_u64(self.index, delta); + accessor.add_u64(self.state_index, delta); Ok(()) } @@ -176,13 +176,18 @@ impl AccumulatorV2 for CountAccumulatorV2 { let counts = states[0].as_any().downcast_ref::().unwrap(); let delta = &compute::sum(counts); if let Some(d) = delta { - accessor.add_u64(self.index, *d); + accessor.add_u64(self.state_index, *d); } 
Ok(()) } fn evaluate(&self, accessor: &RowAccessor) -> Result { - Ok(accessor.get_as_scalar(&DataType::UInt64, self.index)) + Ok(accessor.get_as_scalar(&DataType::UInt64, self.state_index)) + } + + #[inline(always)] + fn state_index(&self) -> usize { + self.state_index } } diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 2062d16c71245..dd2f44b22c075 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -37,7 +37,7 @@ use datafusion_common::ScalarValue; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::Accumulator; -use crate::aggregate::accumulator_v2::AccumulatorV2; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; @@ -114,7 +114,7 @@ impl AggregateExpr for Max { &self.name } - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -130,11 +130,11 @@ impl AggregateExpr for Max { ) } - fn create_accumulator_v2( + fn create_row_accumulator( &self, start_index: usize, - ) -> Result> { - Ok(Box::new(MaxAccumulatorV2::new( + ) -> Result> { + Ok(Box::new(MaxRowAccumulator::new( start_index, self.data_type.clone(), ))) @@ -547,18 +547,18 @@ impl Accumulator for MaxAccumulator { } #[derive(Debug)] -struct MaxAccumulatorV2 { +struct MaxRowAccumulator { index: usize, data_type: DataType, } -impl MaxAccumulatorV2 { +impl MaxRowAccumulator { pub fn new(index: usize, data_type: DataType) -> Self { Self { index, data_type } } } -impl AccumulatorV2 for MaxAccumulatorV2 { +impl RowAccumulator for MaxRowAccumulator { fn update_batch( &mut self, values: &[ArrayRef], @@ -581,6 +581,11 @@ impl AccumulatorV2 for MaxAccumulatorV2 { fn evaluate(&self, accessor: &RowAccessor) -> Result { Ok(accessor.get_as_scalar(&self.data_type, self.index)) } 
+ + #[inline(always)] + fn state_index(&self) -> usize { + self.index + } } /// MIN aggregate expression @@ -642,7 +647,7 @@ impl AggregateExpr for Min { &self.name } - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -658,11 +663,11 @@ impl AggregateExpr for Min { ) } - fn create_accumulator_v2( + fn create_row_accumulator( &self, start_index: usize, - ) -> Result> { - Ok(Box::new(MinAccumulatorV2::new( + ) -> Result> { + Ok(Box::new(MinRowAccumulator::new( start_index, self.data_type.clone(), ))) @@ -706,18 +711,18 @@ impl Accumulator for MinAccumulator { } #[derive(Debug)] -struct MinAccumulatorV2 { +struct MinRowAccumulator { index: usize, data_type: DataType, } -impl MinAccumulatorV2 { +impl MinRowAccumulator { pub fn new(index: usize, data_type: DataType) -> Self { Self { index, data_type } } } -impl AccumulatorV2 for MinAccumulatorV2 { +impl RowAccumulator for MinRowAccumulator { fn update_batch( &mut self, values: &[ArrayRef], @@ -740,6 +745,11 @@ impl AccumulatorV2 for MinAccumulatorV2 { fn evaluate(&self, accessor: &RowAccessor) -> Result { Ok(accessor.get_as_scalar(&self.data_type, self.index)) } + + #[inline(always)] + fn state_index(&self) -> usize { + self.index + } } #[cfg(test)] diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index b9ad768e8eca6..09e8a9b0ac89f 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::aggregate::accumulator_v2::AccumulatorV2; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::PhysicalExpr; use arrow::datatypes::Field; use datafusion_common::{Result, ScalarValue}; @@ -38,9 +38,9 @@ pub(crate) mod count_distinct; pub(crate) mod covariance; #[macro_use] pub(crate) mod min_max; -pub mod accumulator_v2; pub mod build_in; mod hyperloglog; +pub mod row_accumulator; pub(crate) mod stats; pub(crate) mod stddev; pub(crate) mod sum; @@ -81,14 +81,19 @@ pub trait AggregateExpr: Send + Sync + Debug { } /// If the aggregate expression is supported by row format - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { false } - fn create_accumulator_v2( + /// RowAccumulator to access/update row-based aggregation state in-place. + /// Currently, row accumulator only supports states of fixed-sized type. + /// + /// We recommend implementing `RowAccumulator` along with the standard `Accumulator`, + /// when its state is of fixed size, as RowAccumulator is more memory efficient and CPU-friendly. + fn create_row_accumulator( &self, _start_index: usize, - ) -> Result> { + ) -> Result> { unreachable!() } } diff --git a/datafusion/physical-expr/src/aggregate/accumulator_v2.rs b/datafusion/physical-expr/src/aggregate/row_accumulator.rs similarity index 54% rename from datafusion/physical-expr/src/aggregate/accumulator_v2.rs rename to datafusion/physical-expr/src/aggregate/row_accumulator.rs index d2ebf12f710c8..386787454f853 100644 --- a/datafusion/physical-expr/src/aggregate/accumulator_v2.rs +++ b/datafusion/physical-expr/src/aggregate/row_accumulator.rs @@ -22,7 +22,27 @@ use datafusion_common::{Result, ScalarValue}; use datafusion_row::accessor::RowAccessor; use std::fmt::Debug; -pub trait AccumulatorV2: Send + Sync + Debug { +/// Row-based accumulator where the internal aggregate state(s) are stored using row format. 
+/// +/// Unlike the [`datafusion_expr::Accumulator`], the [`RowAccumulator`] does not store the state internally. +/// Instead, it knows how to access/update the state stored in a row via the provided accessor and +/// its state's starting field index in the row. +/// +/// For example, when evaluating `SELECT a, sum(b), avg(c), count(d) from t GROUP BY a;`, we would have one row used as +/// aggregation state for each distinct `a` value, the index of the first and the only state of `sum(b)` would be 0, +/// the index of the first state of `avg(c)` would be 1, and the index of the first and only state of `count(d)` would be 3: +/// +/// sum(b) state_index = 0 count(d) state_index = 3 +/// | | +/// v v +/// +--------+----------+--------+----------+ +/// | sum(b) | count(c) | sum(c) | count(d) | +/// +--------+----------+--------+----------+ +/// ^ +/// | +/// avg(c) state_index = 1 +/// +pub trait RowAccumulator: Send + Sync + Debug { /// updates the accumulator's state from a vector of arrays. fn update_batch( &mut self, @@ -39,4 +39,7 @@ /// returns its value based on its current state. fn evaluate(&self, accessor: &RowAccessor) -> Result; + + /// State's starting field index in the row.
+ fn state_index(&self) -> usize; } diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index b1928c7031531..c369e7af00813 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -34,10 +34,11 @@ use arrow::{ use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::Accumulator; -use crate::aggregate::accumulator_v2::AccumulatorV2; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; +use arrow::compute::cast; use datafusion_row::accessor::RowAccessor; /// SUM aggregate expression @@ -99,7 +100,7 @@ impl AggregateExpr for Sum { &self.name } - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -115,11 +116,11 @@ impl AggregateExpr for Sum { ) } - fn create_accumulator_v2( + fn create_row_accumulator( &self, start_index: usize, - ) -> Result> { - Ok(Box::new(SumAccumulatorV2::new( + ) -> Result> { + Ok(Box::new(SumRowAccumulator::new( start_index, self.data_type.clone(), ))) @@ -172,7 +173,8 @@ fn sum_decimal_batch( } // sums the array and returns a ScalarValue of its corresponding type. -pub(crate) fn sum_batch(values: &ArrayRef) -> Result { +pub(crate) fn sum_batch(values: &ArrayRef, sum_type: &DataType) -> Result { + let values = &cast(values, sum_type)?; Ok(match values.data_type() { DataType::Decimal(precision, scale) => { sum_decimal_batch(values, precision, scale)? 
@@ -439,7 +441,7 @@ impl Accumulator for SumAccumulator { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let values = &values[0]; - self.sum = sum(&self.sum, &sum_batch(values)?)?; + self.sum = sum(&self.sum, &sum_batch(values, &self.sum.get_datatype())?)?; Ok(()) } @@ -456,25 +458,30 @@ impl Accumulator for SumAccumulator { } #[derive(Debug)] -struct SumAccumulatorV2 { +struct SumRowAccumulator { index: usize, datatype: DataType, } -impl SumAccumulatorV2 { +impl SumRowAccumulator { pub fn new(index: usize, datatype: DataType) -> Self { Self { index, datatype } } } -impl AccumulatorV2 for SumAccumulatorV2 { +impl RowAccumulator for SumRowAccumulator { fn update_batch( &mut self, values: &[ArrayRef], accessor: &mut RowAccessor, ) -> Result<()> { let values = &values[0]; - add_to_row(&self.datatype, self.index, accessor, &sum_batch(values)?)?; + add_to_row( + &self.datatype, + self.index, + accessor, + &sum_batch(values, &self.datatype)?, + )?; Ok(()) } @@ -489,6 +496,11 @@ impl AccumulatorV2 for SumAccumulatorV2 { fn evaluate(&self, accessor: &RowAccessor) -> Result { Ok(accessor.get_as_scalar(&self.datatype, self.index)) } + + #[inline(always)] + fn state_index(&self) -> usize { + self.index + } } #[cfg(test)] @@ -532,7 +544,7 @@ mod tests { .collect::() .with_precision_and_scale(10, 0)?, ); - let result = sum_batch(&array)?; + let result = sum_batch(&array, &DataType::Decimal(10, 0))?; assert_eq!(ScalarValue::Decimal128(Some(15), 10, 0), result); // test agg @@ -567,7 +579,7 @@ mod tests { .collect::() .with_precision_and_scale(10, 0)?, ); - let result = sum_batch(&array)?; + let result = sum_batch(&array, &DataType::Decimal(10, 0))?; assert_eq!(ScalarValue::Decimal128(Some(13), 10, 0), result); // test agg @@ -601,7 +613,7 @@ mod tests { .collect::() .with_precision_and_scale(10, 0)?, ); - let result = sum_batch(&array)?; + let result = sum_batch(&array, &DataType::Decimal(10, 0))?; assert_eq!(ScalarValue::Decimal128(None, 10, 0), result); // 
test agg diff --git a/datafusion/row/src/accessor.rs b/datafusion/row/src/accessor.rs index ae5c74b701352..b6ec41d3345bb 100644 --- a/datafusion/row/src/accessor.rs +++ b/datafusion/row/src/accessor.rs @@ -23,13 +23,14 @@ use crate::{fn_get_idx, fn_get_idx_opt, fn_set_idx}; use arrow::datatypes::{DataType, Schema}; use arrow::util::bit_util::{get_bit_raw, set_bit_raw}; use datafusion_common::ScalarValue; +use std::sync::Arc; //TODO: DRY with reader and writer /// Read the tuple `data[base_offset..]` we are currently pointing to pub struct RowAccessor<'a> { /// Layout on how to read each field - layout: RowLayout, + layout: Arc, /// Raw bytes slice where the tuple stores data: &'a mut [u8], /// Start position for the current tuple in the raw bytes slice. @@ -103,13 +104,13 @@ impl<'a> RowAccessor<'a> { /// new pub fn new(schema: &Schema, row_type: RowType) -> Self { Self { - layout: RowLayout::new(schema, row_type), + layout: Arc::new(RowLayout::new(schema, row_type)), data: &mut [], base_offset: 0, } } - pub fn new_from_layout(layout: RowLayout) -> Self { + pub fn new_from_layout(layout: Arc) -> Self { Self { layout, data: &mut [], From 7350cebee1d23c6485f0224db61752c68136ad57 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 6 May 2022 19:24:35 +0800 Subject: [PATCH 8/8] Apply suggestions from code review Co-authored-by: Andrew Lamb --- datafusion/core/src/physical_plan/aggregates/row_hash.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index eac1590dbb110..e364048e75fda 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -395,9 +395,9 @@ impl std::fmt::Debug for AggregationState { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { // hashes are not store inline, so could only get values let map_string = "RawTable"; - 
f.debug_struct("RowAccumulators") + f.debug_struct("AggregationState") .field("map", &map_string) - .field("row_group_states", &self.group_states) + .field("group_states", &self.group_states) .finish() } }