From 475f166340ee5e67b2f133307aac98d339180841 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Tue, 26 Apr 2022 12:07:17 +0800 Subject: [PATCH 1/8] first move: re-group aggregates functionalities in core/physical_p/aggregates --- .../rust/core/src/serde/physical_plan/mod.rs | 12 +- ballista/rust/core/src/utils.rs | 4 +- ballista/rust/scheduler/src/planner.rs | 12 +- .../aggregate_statistics.rs | 43 +- .../src/physical_optimizer/repartition.rs | 6 +- .../core/src/physical_plan/aggregates/hash.rs | 477 ++++++ .../core/src/physical_plan/aggregates/mod.rs | 719 +++++++++ .../physical_plan/aggregates/no_grouping.rs | 165 +++ .../core/src/physical_plan/hash_aggregate.rs | 1299 ----------------- datafusion/core/src/physical_plan/mod.rs | 1 - datafusion/core/src/physical_plan/planner.rs | 8 +- datafusion/core/tests/sql/explain_analyze.rs | 2 +- 12 files changed, 1403 insertions(+), 1345 deletions(-) create mode 100644 datafusion/core/src/physical_plan/aggregates/hash.rs create mode 100644 datafusion/core/src/physical_plan/aggregates/no_grouping.rs delete mode 100644 datafusion/core/src/physical_plan/hash_aggregate.rs diff --git a/ballista/rust/core/src/serde/physical_plan/mod.rs b/ballista/rust/core/src/serde/physical_plan/mod.rs index ed268820f3947..81e305e2ade99 100644 --- a/ballista/rust/core/src/serde/physical_plan/mod.rs +++ b/ballista/rust/core/src/serde/physical_plan/mod.rs @@ -28,7 +28,8 @@ use datafusion::datasource::listing::PartitionedFile; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::logical_plan::window_frames::WindowFrame; use datafusion::logical_plan::FunctionRegistry; -use datafusion::physical_plan::aggregates::create_aggregate_expr; +use datafusion::physical_plan::aggregates::AggregateExec; +use datafusion::physical_plan::aggregates::{create_aggregate_expr, AggregateMode}; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use 
datafusion::physical_plan::cross_join::CrossJoinExec; @@ -39,7 +40,6 @@ use datafusion::physical_plan::file_format::{ AvroExec, CsvExec, FileScanConfig, ParquetExec, }; use datafusion::physical_plan::filter::FilterExec; -use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::hash_join::{HashJoinExec, PartitionMode}; use datafusion::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion::physical_plan::projection::ProjectionExec; @@ -391,7 +391,7 @@ impl AsExecutionPlan for PhysicalPlanNode { }) .collect::, _>>()?; - Ok(Arc::new(HashAggregateExec::try_new( + Ok(Arc::new(AggregateExec::try_new( agg_mode, group, physical_aggr_expr, @@ -730,7 +730,7 @@ impl AsExecutionPlan for PhysicalPlanNode { }, ))), }) - } else if let Some(exec) = plan.downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { let groups = exec .group_expr() .iter() @@ -1080,12 +1080,12 @@ mod roundtrip_tests { datasource::listing::PartitionedFile, logical_plan::{JoinType, Operator}, physical_plan::{ + aggregates::{AggregateExec, AggregateMode}, empty::EmptyExec, expressions::{binary, col, lit, InListExpr, NotExpr}, expressions::{Avg, Column, PhysicalSortExpr}, file_format::{FileScanConfig, ParquetExec}, filter::FilterExec, - hash_aggregate::{AggregateMode, HashAggregateExec}, hash_join::{HashJoinExec, PartitionMode}, limit::{GlobalLimitExec, LocalLimitExec}, sorts::sort::SortExec, @@ -1226,7 +1226,7 @@ mod roundtrip_tests { DataType::Float64, ))]; - roundtrip_test(Arc::new(HashAggregateExec::try_new( + roundtrip_test(Arc::new(AggregateExec::try_new( AggregateMode::Final, groups.clone(), aggregates.clone(), diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 6670ab5cedd83..310f925936b3a 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -48,9 +48,9 @@ use datafusion::physical_plan::common::batch_byte_size; use 
datafusion::physical_plan::empty::EmptyExec; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::physical_plan::aggregates::AggregateExec; use datafusion::physical_plan::file_format::{CsvExec, ParquetExec}; use datafusion::physical_plan::filter::FilterExec; -use datafusion::physical_plan::hash_aggregate::HashAggregateExec; use datafusion::physical_plan::hash_join::HashJoinExec; use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::sorts::sort::SortExec; @@ -151,7 +151,7 @@ fn build_exec_plan_diagram( id: &mut AtomicUsize, draw_entity: bool, ) -> Result { - let operator_str = if plan.as_any().downcast_ref::().is_some() { + let operator_str = if plan.as_any().downcast_ref::().is_some() { "HashAggregateExec" } else if plan.as_any().downcast_ref::().is_some() { "SortExec" diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 8198c4ed27c0d..0d2de5089ae30 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -276,8 +276,8 @@ mod test { use ballista_core::error::BallistaError; use ballista_core::execution_plans::UnresolvedShuffleExec; use ballista_core::serde::{protobuf, AsExecutionPlan, BallistaCodec}; + use datafusion::physical_plan::aggregates::{AggregateExec, AggregateMode}; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; - use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::hash_join::HashJoinExec; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::{ @@ -346,14 +346,14 @@ mod test { // verify stage 0 let stage0 = stages[0].children()[0].clone(); - let partial_hash = downcast_exec!(stage0, HashAggregateExec); + let partial_hash = downcast_exec!(stage0, AggregateExec); assert!(*partial_hash.mode() == AggregateMode::Partial); // verify stage 1 let stage1 = stages[1].children()[0].clone(); let 
projection = downcast_exec!(stage1, ProjectionExec); let final_hash = projection.children()[0].clone(); - let final_hash = downcast_exec!(final_hash, HashAggregateExec); + let final_hash = downcast_exec!(final_hash, AggregateExec); assert!(*final_hash.mode() == AggregateMode::FinalPartitioned); let coalesce = final_hash.children()[0].clone(); let coalesce = downcast_exec!(coalesce, CoalesceBatchesExec); @@ -514,7 +514,7 @@ order by .partition_count() ); - let hash_agg = downcast_exec!(input, HashAggregateExec); + let hash_agg = downcast_exec!(input, AggregateExec); let coalesce_batches = hash_agg.children()[0].clone(); let coalesce_batches = downcast_exec!(coalesce_batches, CoalesceBatchesExec); @@ -586,8 +586,8 @@ order by let partial_hash = stages[0].children()[0].clone(); let partial_hash_serde = roundtrip_operator(partial_hash.clone())?; - let partial_hash = downcast_exec!(partial_hash, HashAggregateExec); - let partial_hash_serde = downcast_exec!(partial_hash_serde, HashAggregateExec); + let partial_hash = downcast_exec!(partial_hash, AggregateExec); + let partial_hash_serde = downcast_exec!(partial_hash_serde, AggregateExec); assert_eq!( format!("{:?}", partial_hash), diff --git a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs index 9af053f934fb9..f8004516738d5 100644 --- a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs +++ b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs @@ -21,8 +21,8 @@ use std::sync::Arc; use arrow::datatypes::Schema; use crate::execution::context::SessionConfig; +use crate::physical_plan::aggregates::{AggregateExec, AggregateMode}; use crate::physical_plan::empty::EmptyExec; -use crate::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::{ expressions, AggregateExpr, ColumnStatistics, ExecutionPlan, Statistics, @@ -53,7 +53,7 @@ impl 
PhysicalOptimizerRule for AggregateStatistics { if let Some(partial_agg_exec) = take_optimizable(&*plan) { let partial_agg_exec = partial_agg_exec .as_any() - .downcast_ref::() + .downcast_ref::() .expect("take_optimizable() ensures that this is a HashAggregateExec"); let stats = partial_agg_exec.input().statistics(); let mut projections = vec![]; @@ -104,14 +104,14 @@ impl PhysicalOptimizerRule for AggregateStatistics { /// We would have prefered to return a casted ref to HashAggregateExec but the recursion requires /// the `ExecutionPlan.children()` method that returns an owned reference. fn take_optimizable(node: &dyn ExecutionPlan) -> Option> { - if let Some(final_agg_exec) = node.as_any().downcast_ref::() { + if let Some(final_agg_exec) = node.as_any().downcast_ref::() { if final_agg_exec.mode() == &AggregateMode::Final && final_agg_exec.group_expr().is_empty() { let mut child = Arc::clone(final_agg_exec.input()); loop { if let Some(partial_agg_exec) = - child.as_any().downcast_ref::() + child.as_any().downcast_ref::() { if partial_agg_exec.mode() == &AggregateMode::Partial && partial_agg_exec.group_expr().is_empty() @@ -260,11 +260,11 @@ mod tests { use crate::error::Result; use crate::logical_plan::Operator; + use crate::physical_plan::aggregates::AggregateExec; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::common; use crate::physical_plan::expressions::Count; use crate::physical_plan::filter::FilterExec; - use crate::physical_plan::hash_aggregate::HashAggregateExec; use crate::physical_plan::memory::MemoryExec; use crate::prelude::SessionContext; @@ -291,10 +291,7 @@ mod tests { } /// Checks that the count optimization was applied and we still get the right result - async fn assert_count_optim_success( - plan: HashAggregateExec, - nulls: bool, - ) -> Result<()> { + async fn assert_count_optim_success(plan: AggregateExec, nulls: bool) -> Result<()> { let session_ctx = SessionContext::new(); let task_ctx = 
session_ctx.task_ctx(); let conf = session_ctx.copied_config(); @@ -336,7 +333,7 @@ mod tests { let source = mock_data()?; let schema = source.schema(); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(None, None)], @@ -344,7 +341,7 @@ mod tests { Arc::clone(&schema), )?; - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(None, None)], @@ -363,7 +360,7 @@ mod tests { let source = mock_data()?; let schema = source.schema(); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -371,7 +368,7 @@ mod tests { Arc::clone(&schema), )?; - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -389,7 +386,7 @@ mod tests { let source = mock_data()?; let schema = source.schema(); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(None, None)], @@ -400,7 +397,7 @@ mod tests { // We introduce an intermediate optimization step between the partial and final aggregtator let coalesce = CoalescePartitionsExec::new(Arc::new(partial_agg)); - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(None, None)], @@ -418,7 +415,7 @@ mod tests { let source = mock_data()?; let schema = source.schema(); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -429,7 +426,7 @@ mod tests { // We introduce an intermediate optimization step between the partial and final aggregtator let coalesce = 
CoalescePartitionsExec::new(Arc::new(partial_agg)); - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -458,7 +455,7 @@ mod tests { source, )?); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(None, None)], @@ -466,7 +463,7 @@ mod tests { Arc::clone(&schema), )?; - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(None, None)], @@ -479,7 +476,7 @@ mod tests { AggregateStatistics::new().optimize(Arc::new(final_agg), &conf)?; // check that the original ExecutionPlan was not replaced - assert!(optimized.as_any().is::()); + assert!(optimized.as_any().is::()); Ok(()) } @@ -500,7 +497,7 @@ mod tests { source, )?); - let partial_agg = HashAggregateExec::try_new( + let partial_agg = AggregateExec::try_new( AggregateMode::Partial, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -508,7 +505,7 @@ mod tests { Arc::clone(&schema), )?; - let final_agg = HashAggregateExec::try_new( + let final_agg = AggregateExec::try_new( AggregateMode::Final, vec![], vec![count_expr(Some(&schema), Some("a"))], @@ -521,7 +518,7 @@ mod tests { AggregateStatistics::new().optimize(Arc::new(final_agg), &conf)?; // check that the original ExecutionPlan was not replaced - assert!(optimized.as_any().is::()); + assert!(optimized.as_any().is::()); Ok(()) } diff --git a/datafusion/core/src/physical_optimizer/repartition.rs b/datafusion/core/src/physical_optimizer/repartition.rs index 2506348fe7a05..cab7ec5d4a43c 100644 --- a/datafusion/core/src/physical_optimizer/repartition.rs +++ b/datafusion/core/src/physical_optimizer/repartition.rs @@ -241,10 +241,10 @@ mod tests { use super::*; use crate::datasource::listing::PartitionedFile; + use crate::physical_plan::aggregates::{AggregateExec, AggregateMode}; use 
crate::physical_plan::expressions::{col, PhysicalSortExpr}; use crate::physical_plan::file_format::{FileScanConfig, ParquetExec}; use crate::physical_plan::filter::FilterExec; - use crate::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::sorts::sort::SortExec; @@ -303,12 +303,12 @@ mod tests { fn hash_aggregate(input: Arc) -> Arc { let schema = schema(); Arc::new( - HashAggregateExec::try_new( + AggregateExec::try_new( AggregateMode::Final, vec![], vec![], Arc::new( - HashAggregateExec::try_new( + AggregateExec::try_new( AggregateMode::Partial, vec![], vec![], diff --git a/datafusion/core/src/physical_plan/aggregates/hash.rs b/datafusion/core/src/physical_plan/aggregates/hash.rs new file mode 100644 index 0000000000000..85e82f14c55d5 --- /dev/null +++ b/datafusion/core/src/physical_plan/aggregates/hash.rs @@ -0,0 +1,477 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Defines the execution plan for the hash aggregate operation + +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::vec; + +use ahash::RandomState; +use futures::{ + ready, + stream::{Stream, StreamExt}, +}; + +use crate::error::Result; +use crate::physical_plan::aggregates::{AccumulatorItem, AggregateMode}; +use crate::physical_plan::hash_utils::create_hashes; +use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; +use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; +use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; +use crate::scalar::ScalarValue; + +use arrow::{array::ArrayRef, compute, compute::cast}; +use arrow::{ + array::{Array, UInt32Builder}, + error::{ArrowError, Result as ArrowResult}, +}; +use arrow::{ + datatypes::{Schema, SchemaRef}, + record_batch::RecordBatch, +}; +use hashbrown::raw::RawTable; + +/* +The architecture is the following: + +1. An accumulator has state that is updated on each batch. +2. At the end of the aggregation (e.g. end of batches in a partition), the accumulator converts its state to a RecordBatch of a single row +3. The RecordBatches of all accumulators are merged (`concatenate` in `rust/arrow`) together to a single RecordBatch. +4. The state's RecordBatch is `merge`d to a new state +5. The state is mapped to the final value + +Why: + +* Accumulators' state can be statically typed, but it is more efficient to transmit data from the accumulators via `Array` +* The `merge` operation must have access to the state of the aggregators because it uses it to correctly merge +* It uses Arrow's native dynamically typed object, `Array`. +* Arrow shines in batch operations and both `merge` and `concatenate` of uniform types are very performant. + +Example: average + +* the state is `n: u32` and `sum: f64` +* For every batch, we update them accordingly. 
+* At the end of the accumulation (of a partition), we convert `n` and `sum` to a RecordBatch of 1 row and two columns: `[n, sum]` +* The RecordBatch is (sent back / transmitted over network) +* Once all N record batches arrive, `merge` is performed, which builds a RecordBatch with N rows and 2 columns. +* Finally, `get_value` returns an array with one entry computed from the state +*/ +pub(crate) struct GroupedHashAggregateStream { + schema: SchemaRef, + input: SendableRecordBatchStream, + mode: AggregateMode, + accumulators: Accumulators, + aggregate_expressions: Vec>>, + + aggr_expr: Vec>, + group_expr: Vec>, + + baseline_metrics: BaselineMetrics, + random_state: RandomState, + finished: bool, +} + +impl GroupedHashAggregateStream { + /// Create a new HashAggregateStream + pub fn new( + mode: AggregateMode, + schema: SchemaRef, + group_expr: Vec>, + aggr_expr: Vec>, + input: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + ) -> Result { + let timer = baseline_metrics.elapsed_compute().timer(); + + // The expressions to evaluate the batch, one vec of expressions per aggregation. + // Assume create_schema() always put group columns in front of aggr columns, we set + // col_idx_base to group expression count. 
+ let aggregate_expressions = + aggregates::aggregate_expressions(&aggr_expr, &mode, group_expr.len())?; + + timer.done(); + + Ok(Self { + schema, + mode, + input, + aggr_expr, + group_expr, + baseline_metrics, + aggregate_expressions, + accumulators: Default::default(), + random_state: Default::default(), + finished: false, + }) + } +} + +impl Stream for GroupedHashAggregateStream { + type Item = ArrowResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let this = &mut *self; + if this.finished { + return Poll::Ready(None); + } + + let elapsed_compute = this.baseline_metrics.elapsed_compute(); + + loop { + let result = match ready!(this.input.poll_next_unpin(cx)) { + Some(Ok(batch)) => { + let timer = elapsed_compute.timer(); + let result = group_aggregate_batch( + &this.mode, + &this.random_state, + &this.group_expr, + &this.aggr_expr, + batch, + &mut this.accumulators, + &this.aggregate_expressions, + ); + + timer.done(); + + match result { + Ok(_) => continue, + Err(e) => Err(ArrowError::ExternalError(Box::new(e))), + } + } + Some(Err(e)) => Err(e), + None => { + this.finished = true; + let timer = this.baseline_metrics.elapsed_compute().timer(); + let result = create_batch_from_map( + &this.mode, + &this.accumulators, + this.group_expr.len(), + &this.schema, + ) + .record_output(&this.baseline_metrics); + + timer.done(); + result + } + }; + + this.finished = true; + return Poll::Ready(Some(result)); + } + } +} + +impl RecordBatchStream for GroupedHashAggregateStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +/// TODO: Make this a member function of [`GroupedHashAggregateStream`] +fn group_aggregate_batch( + mode: &AggregateMode, + random_state: &RandomState, + group_expr: &[Arc], + aggr_expr: &[Arc], + batch: RecordBatch, + accumulators: &mut Accumulators, + aggregate_expressions: &[Vec>], +) -> Result<()> { + // evaluate the grouping expressions + let group_values = 
evaluate(group_expr, &batch)?; + + // evaluate the aggregation expressions. + // We could evaluate them after the `take`, but since we need to evaluate all + // of them anyways, it is more performant to do it while they are together. + let aggr_input_values = evaluate_many(aggregate_expressions, &batch)?; + + // 1.1 construct the key from the group values + // 1.2 construct the mapping key if it does not exist + // 1.3 add the row' index to `indices` + + // track which entries in `accumulators` have rows in this batch to aggregate + let mut groups_with_rows = vec![]; + + // 1.1 Calculate the group keys for the group values + let mut batch_hashes = vec![0; batch.num_rows()]; + create_hashes(&group_values, random_state, &mut batch_hashes)?; + + for (row, hash) in batch_hashes.into_iter().enumerate() { + let Accumulators { map, group_states } = accumulators; + + let entry = map.get_mut(hash, |(_hash, group_idx)| { + // verify that a group that we are inserting with hash is + // actually the same key value as the group in + // existing_idx (aka group_values @ row) + let group_state = &group_states[*group_idx]; + group_values + .iter() + .zip(group_state.group_by_values.iter()) + .all(|(array, scalar)| scalar.eq_array(array, row)) + }); + + match entry { + // Existing entry for this group value + Some((_hash, group_idx)) => { + let group_state = &mut group_states[*group_idx]; + // 1.3 + if group_state.indices.is_empty() { + groups_with_rows.push(*group_idx); + }; + group_state.indices.push(row as u32); // remember this row + } + // 1.2 Need to create new entry + None => { + let accumulator_set = aggregates::create_accumulators(aggr_expr)?; + + // Copy group values out of arrays into `ScalarValue`s + let group_by_values = group_values + .iter() + .map(|col| ScalarValue::try_from_array(col, row)) + .collect::>>()?; + + // Add new entry to group_states and save newly created index + let group_state = GroupState { + group_by_values: group_by_values.into_boxed_slice(), + 
accumulator_set, + indices: vec![row as u32], // 1.3 + }; + let group_idx = group_states.len(); + group_states.push(group_state); + groups_with_rows.push(group_idx); + + // for hasher function, use precomputed hash value + map.insert(hash, (hash, group_idx), |(hash, _group_idx)| *hash); + } + }; + } + + // Collect all indices + offsets based on keys in this vec + let mut batch_indices: UInt32Builder = UInt32Builder::new(0); + let mut offsets = vec![0]; + let mut offset_so_far = 0; + for group_idx in groups_with_rows.iter() { + let indices = &accumulators.group_states[*group_idx].indices; + batch_indices.append_slice(indices)?; + offset_so_far += indices.len(); + offsets.push(offset_so_far); + } + let batch_indices = batch_indices.finish(); + + // `Take` all values based on indices into Arrays + let values: Vec>> = aggr_input_values + .iter() + .map(|array| { + array + .iter() + .map(|array| { + compute::take( + array.as_ref(), + &batch_indices, + None, // None: no index check + ) + .unwrap() + }) + .collect() + // 2.3 + }) + .collect(); + + // 2.1 for each key in this batch + // 2.2 for each aggregation + // 2.3 `slice` from each of its arrays the keys' values + // 2.4 update / merge the accumulator with the values + // 2.5 clear indices + groups_with_rows + .iter() + .zip(offsets.windows(2)) + .try_for_each(|(group_idx, offsets)| { + let group_state = &mut accumulators.group_states[*group_idx]; + // 2.2 + group_state + .accumulator_set + .iter_mut() + .zip(values.iter()) + .map(|(accumulator, aggr_array)| { + ( + accumulator, + aggr_array + .iter() + .map(|array| { + // 2.3 + array.slice(offsets[0], offsets[1] - offsets[0]) + }) + .collect::>(), + ) + }) + .try_for_each(|(accumulator, values)| match mode { + AggregateMode::Partial => accumulator.update_batch(&values), + AggregateMode::FinalPartitioned | AggregateMode::Final => { + // note: the aggregation here is over states, not values, thus the merge + accumulator.merge_batch(&values) + } + }) + // 2.5 + .and({ 
+ group_state.indices.clear(); + Ok(()) + }) + })?; + + Ok(()) +} + +/// The state that is built for each output group. +#[derive(Debug)] +struct GroupState { + /// The actual group by values, one for each group column + group_by_values: Box<[ScalarValue]>, + + // Accumulator state, one for each aggregate + accumulator_set: Vec, + + /// scratch space used to collect indices for input rows in a + /// bach that have values to aggregate. Reset on each batch + indices: Vec, +} + +/// The state of all the groups +#[derive(Default)] +struct Accumulators { + /// Logically maps group values to an index in `group_states` + /// + /// Uses the raw API of hashbrown to avoid actually storing the + /// keys in the table + /// + /// keys: u64 hashes of the GroupValue + /// values: (hash, index into `group_states`) + map: RawTable<(u64, usize)>, + + /// State for each group + group_states: Vec, +} + +impl std::fmt::Debug for Accumulators { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + // hashes are not store inline, so could only get values + let map_string = "RawTable"; + f.debug_struct("Accumulators") + .field("map", &map_string) + .field("group_states", &self.group_states) + .finish() + } +} + +/// Evaluates expressions against a record batch. +fn evaluate( + expr: &[Arc], + batch: &RecordBatch, +) -> Result> { + expr.iter() + .map(|expr| expr.evaluate(batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>() +} + +/// Evaluates expressions against a record batch. +fn evaluate_many( + expr: &[Vec>], + batch: &RecordBatch, +) -> Result>> { + expr.iter() + .map(|expr| evaluate(expr, batch)) + .collect::>>() +} + +/// Create a RecordBatch with all group keys and accumulator' states or values. 
+fn create_batch_from_map( + mode: &AggregateMode, + accumulators: &Accumulators, + num_group_expr: usize, + output_schema: &Schema, +) -> ArrowResult { + if accumulators.group_states.is_empty() { + return Ok(RecordBatch::new_empty(Arc::new(output_schema.to_owned()))); + } + let accs = &accumulators.group_states[0].accumulator_set; + let mut acc_data_types: Vec = vec![]; + + // Calculate number/shape of state arrays + match mode { + AggregateMode::Partial => { + for acc in accs.iter() { + let state = acc.state()?; + acc_data_types.push(state.len()); + } + } + AggregateMode::Final | AggregateMode::FinalPartitioned => { + acc_data_types = vec![1; accs.len()]; + } + } + + let mut columns = (0..num_group_expr) + .map(|i| { + ScalarValue::iter_to_array( + accumulators + .group_states + .iter() + .map(|group_state| group_state.group_by_values[i].clone()), + ) + }) + .collect::>>()?; + + // add state / evaluated arrays + for (x, &state_len) in acc_data_types.iter().enumerate() { + for y in 0..state_len { + match mode { + AggregateMode::Partial => { + let res = ScalarValue::iter_to_array( + accumulators.group_states.iter().map(|group_state| { + let x = group_state.accumulator_set[x].state().unwrap(); + x[y].clone() + }), + )?; + + columns.push(res); + } + AggregateMode::Final | AggregateMode::FinalPartitioned => { + let res = ScalarValue::iter_to_array( + accumulators.group_states.iter().map(|group_state| { + group_state.accumulator_set[x].evaluate().unwrap() + }), + )?; + columns.push(res); + } + } + } + } + + // cast output if needed (e.g. 
for types like Dictionary where + // the intermediate GroupByScalar type was not the same as the + // output + let columns = columns + .iter() + .zip(output_schema.fields().iter()) + .map(|(col, desired_field)| cast(col, desired_field.data_type())) + .collect::>>()?; + + RecordBatch::try_new(Arc::new(output_schema.to_owned()), columns) +} diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index c0208b23974bd..af7df3dccfc71 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -17,5 +17,724 @@ //! Aggregates functionalities +use crate::execution::context::TaskContext; +use crate::physical_plan::aggregates::hash::GroupedHashAggregateStream; +use crate::physical_plan::aggregates::no_grouping::NoGroupingAggregateStream; +use crate::physical_plan::metrics::{ + BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, +}; +use crate::physical_plan::{ + DisplayFormatType, Distribution, ExecutionPlan, Partitioning, + SendableRecordBatchStream, Statistics, +}; +use arrow::array::ArrayRef; +use arrow::datatypes::{Field, Schema, SchemaRef}; +use async_trait::async_trait; +use datafusion_common::Result; +use datafusion_expr::Accumulator; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::{ + expressions, AggregateExpr, PhysicalExpr, PhysicalSortExpr, +}; +use std::any::Any; +use std::sync::Arc; + +mod hash; +mod no_grouping; + pub use datafusion_expr::AggregateFunction; pub use datafusion_physical_expr::expressions::create_aggregate_expr; + +/// Hash aggregate modes +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum AggregateMode { + /// Partial aggregate that can be applied in parallel across input partitions + Partial, + /// Final aggregate that produces a single partition of output + Final, + /// Final aggregate that works on pre-partitioned data. 
+ /// + /// This requires the invariant that all rows with a particular + /// grouping key are in the same partitions, such as is the case + /// with Hash repartitioning on the group keys. If a group key is + /// duplicated, duplicate groups would be produced + FinalPartitioned, +} + +/// Hash aggregate execution plan +#[derive(Debug)] +pub struct AggregateExec { + /// Aggregation mode (full, partial) + mode: AggregateMode, + /// Grouping expressions + group_expr: Vec<(Arc, String)>, + /// Aggregate expressions + aggr_expr: Vec>, + /// Input plan, could be a partial aggregate or the input to the aggregate + input: Arc, + /// Schema after the aggregate is applied + schema: SchemaRef, + /// Input schema before any aggregation is applied. For partial aggregate this will be the + /// same as input.schema() but for the final aggregate it will be the same as the input + /// to the partial aggregate + input_schema: SchemaRef, + /// Execution Metrics + metrics: ExecutionPlanMetricsSet, +} + +impl AggregateExec { + /// Create a new hash aggregate execution plan + pub fn try_new( + mode: AggregateMode, + group_expr: Vec<(Arc, String)>, + aggr_expr: Vec>, + input: Arc, + input_schema: SchemaRef, + ) -> Result { + let schema = create_schema(&input.schema(), &group_expr, &aggr_expr, mode)?; + + let schema = Arc::new(schema); + + Ok(AggregateExec { + mode, + group_expr, + aggr_expr, + input, + schema, + input_schema, + metrics: ExecutionPlanMetricsSet::new(), + }) + } + + /// Aggregation mode (full, partial) + pub fn mode(&self) -> &AggregateMode { + &self.mode + } + + /// Grouping expressions + pub fn group_expr(&self) -> &[(Arc, String)] { + &self.group_expr + } + + /// Grouping expressions as they occur in the output schema + pub fn output_group_expr(&self) -> Vec> { + // Update column indices. Since the group by columns come first in the output schema, their + // indices are simply 0..self.group_expr(len). 
+ self.group_expr + .iter() + .enumerate() + .map(|(index, (_col, name))| { + Arc::new(expressions::Column::new(name, index)) as Arc + }) + .collect() + } + + /// Aggregate expressions + pub fn aggr_expr(&self) -> &[Arc] { + &self.aggr_expr + } + + /// Input plan + pub fn input(&self) -> &Arc { + &self.input + } + + /// Get the input schema before any aggregates are applied + pub fn input_schema(&self) -> SchemaRef { + self.input_schema.clone() + } +} + +#[async_trait] +impl ExecutionPlan for AggregateExec { + /// Return a reference to Any that can be used for down-casting + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + /// Get the output partitioning of this plan + fn output_partitioning(&self) -> Partitioning { + self.input.output_partitioning() + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn required_child_distribution(&self) -> Distribution { + match &self.mode { + AggregateMode::Partial => Distribution::UnspecifiedDistribution, + AggregateMode::FinalPartitioned => Distribution::HashPartitioned( + self.group_expr.iter().map(|x| x.0.clone()).collect(), + ), + AggregateMode::Final => Distribution::SinglePartition, + } + } + + fn relies_on_input_order(&self) -> bool { + false + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + Ok(Arc::new(AggregateExec::try_new( + self.mode, + self.group_expr.clone(), + self.aggr_expr.clone(), + children[0].clone(), + self.input_schema.clone(), + )?)) + } + + async fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let input = self.input.execute(partition, context).await?; + let group_expr = self.group_expr.iter().map(|x| x.0.clone()).collect(); + + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + + if self.group_expr.is_empty() { + Ok(Box::pin(NoGroupingAggregateStream::new( + self.mode, + 
 self.schema.clone(), + self.aggr_expr.clone(), + input, + baseline_metrics, + )?)) + } else { + Ok(Box::pin(GroupedHashAggregateStream::new( + self.mode, + self.schema.clone(), + group_expr, + self.aggr_expr.clone(), + input, + baseline_metrics, + )?)) + } + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "HashAggregateExec: mode={:?}", self.mode)?; + let g: Vec = self + .group_expr + .iter() + .map(|(e, alias)| { + let e = e.to_string(); + if &e != alias { + format!("{} as {}", e, alias) + } else { + e + } + }) + .collect(); + write!(f, ", gby=[{}]", g.join(", "))?; + + let a: Vec = self + .aggr_expr + .iter() + .map(|agg| agg.name().to_string()) + .collect(); + write!(f, ", aggr=[{}]", a.join(", "))?; + } + } + Ok(()) + } + + fn statistics(&self) -> Statistics { + // TODO stats: group expressions: + // - once expressions will be able to compute their own stats, use it here + // - case where we group by on a column for which we have the `distinct` stat + // TODO stats: aggr expression: + // - aggregations sometimes also preserve invariants such as min, max... 
+ match self.mode { + AggregateMode::Final | AggregateMode::FinalPartitioned + if self.group_expr.is_empty() => + { + Statistics { + num_rows: Some(1), + is_exact: true, + ..Default::default() + } + } + _ => Statistics::default(), + } + } +} + +fn create_schema( + input_schema: &Schema, + group_expr: &[(Arc, String)], + aggr_expr: &[Arc], + mode: AggregateMode, +) -> datafusion_common::Result { + let mut fields = Vec::with_capacity(group_expr.len() + aggr_expr.len()); + for (expr, name) in group_expr { + fields.push(Field::new( + name, + expr.data_type(input_schema)?, + expr.nullable(input_schema)?, + )) + } + + match mode { + AggregateMode::Partial => { + // in partial mode, the fields of the accumulator's state + for expr in aggr_expr { + fields.extend(expr.state_fields()?.iter().cloned()) + } + } + AggregateMode::Final | AggregateMode::FinalPartitioned => { + // in final mode, the field with the final result of the accumulator + for expr in aggr_expr { + fields.push(expr.field()?) + } + } + } + + Ok(Schema::new(fields)) +} + +/// returns physical expressions to evaluate against a batch +/// The expressions are different depending on `mode`: +/// * Partial: AggregateExpr::expressions +/// * Final: columns of `AggregateExpr::state_fields()` +fn aggregate_expressions( + aggr_expr: &[Arc], + mode: &AggregateMode, + col_idx_base: usize, +) -> datafusion_common::Result>>> { + match mode { + AggregateMode::Partial => { + Ok(aggr_expr.iter().map(|agg| agg.expressions()).collect()) + } + // in this mode, we build the merge expressions of the aggregation + AggregateMode::Final | AggregateMode::FinalPartitioned => { + let mut col_idx_base = col_idx_base; + Ok(aggr_expr + .iter() + .map(|agg| { + let exprs = merge_expressions(col_idx_base, agg)?; + col_idx_base += exprs.len(); + Ok(exprs) + }) + .collect::>>()?) + } + } +} + +/// uses `state_fields` to build a vec of physical column expressions required to merge the +/// AggregateExpr' accumulator's state. 
+/// +/// `index_base` is the starting physical column index for the next expanded state field. +fn merge_expressions( + index_base: usize, + expr: &Arc, +) -> Result>> { + Ok(expr + .state_fields()? + .iter() + .enumerate() + .map(|(idx, f)| { + Arc::new(Column::new(f.name(), index_base + idx)) as Arc + }) + .collect::>()) +} + +pub(crate) type AccumulatorItem = Box; + +fn create_accumulators( + aggr_expr: &[Arc], +) -> datafusion_common::Result> { + aggr_expr + .iter() + .map(|expr| expr.create_accumulator()) + .collect::>>() +} + +/// returns a vector of ArrayRefs, where each entry corresponds to either the +/// final value (mode = Final) or states (mode = Partial) +fn finalize_aggregation( + accumulators: &[AccumulatorItem], + mode: &AggregateMode, +) -> datafusion_common::Result> { + match mode { + AggregateMode::Partial => { + // build the vector of states + let a = accumulators + .iter() + .map(|accumulator| accumulator.state()) + .map(|value| { + value.map(|e| { + e.iter().map(|v| v.to_array()).collect::>() + }) + }) + .collect::>>()?; + Ok(a.iter().flatten().cloned().collect::>()) + } + AggregateMode::Final | AggregateMode::FinalPartitioned => { + // merge the state to the final value + accumulators + .iter() + .map(|accumulator| accumulator.evaluate().map(|v| v.to_array())) + .collect::>>() + } + } +} + +#[cfg(test)] +mod tests { + use crate::execution::context::TaskContext; + use crate::from_slice::FromSlice; + use crate::physical_plan::aggregates::{AggregateExec, AggregateMode}; + use crate::physical_plan::expressions::{col, Avg}; + use crate::test::assert_is_pending; + use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; + use crate::{assert_batches_sorted_eq, physical_plan::common}; + use arrow::array::{Float64Array, UInt32Array}; + use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; + use arrow::error::Result as ArrowResult; + use arrow::record_batch::RecordBatch; + use async_trait::async_trait; + use 
datafusion_common::{DataFusionError, Result}; + use datafusion_physical_expr::{AggregateExpr, PhysicalExpr, PhysicalSortExpr}; + use futures::{FutureExt, Stream}; + use std::any::Any; + use std::sync::Arc; + use std::task::{Context, Poll}; + + use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; + use crate::physical_plan::{ + ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, + Statistics, + }; + use crate::prelude::SessionContext; + + /// some mock data to aggregates + fn some_data() -> (Arc, Vec) { + // define a schema. + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::UInt32, false), + Field::new("b", DataType::Float64, false), + ])); + + // define data. + ( + schema.clone(), + vec![ + RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt32Array::from_slice(&[2, 3, 4, 4])), + Arc::new(Float64Array::from_slice(&[1.0, 2.0, 3.0, 4.0])), + ], + ) + .unwrap(), + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt32Array::from_slice(&[2, 3, 3, 4])), + Arc::new(Float64Array::from_slice(&[1.0, 2.0, 3.0, 4.0])), + ], + ) + .unwrap(), + ], + ) + } + + /// build the aggregates on the data from some_data() and check the results + async fn check_aggregates(input: Arc) -> Result<()> { + let input_schema = input.schema(); + + let groups: Vec<(Arc, String)> = + vec![(col("a", &input_schema)?, "a".to_string())]; + + let aggregates: Vec> = vec![Arc::new(Avg::new( + col("b", &input_schema)?, + "AVG(b)".to_string(), + DataType::Float64, + ))]; + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + + let partial_aggregate = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + groups.clone(), + aggregates.clone(), + input, + input_schema.clone(), + )?); + + let result = + common::collect(partial_aggregate.execute(0, task_ctx.clone()).await?) 
+ .await?; + + let expected = vec![ + "+---+---------------+-------------+", + "| a | AVG(b)[count] | AVG(b)[sum] |", + "+---+---------------+-------------+", + "| 2 | 2 | 2 |", + "| 3 | 3 | 7 |", + "| 4 | 3 | 11 |", + "+---+---------------+-------------+", + ]; + assert_batches_sorted_eq!(expected, &result); + + let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); + + let final_group: Vec> = (0..groups.len()) + .map(|i| col(&groups[i].1, &input_schema)) + .collect::>()?; + + let merged_aggregate = Arc::new(AggregateExec::try_new( + AggregateMode::Final, + final_group + .iter() + .enumerate() + .map(|(i, expr)| (expr.clone(), groups[i].1.clone())) + .collect(), + aggregates, + merge, + input_schema, + )?); + + let result = + common::collect(merged_aggregate.execute(0, task_ctx.clone()).await?).await?; + assert_eq!(result.len(), 1); + + let batch = &result[0]; + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.num_rows(), 3); + + let expected = vec![ + "+---+--------------------+", + "| a | AVG(b) |", + "+---+--------------------+", + "| 2 | 1 |", + "| 3 | 2.3333333333333335 |", // 3, (2 + 3 + 2) / 3 + "| 4 | 3.6666666666666665 |", // 4, (3 + 4 + 4) / 3 + "+---+--------------------+", + ]; + + assert_batches_sorted_eq!(&expected, &result); + + let metrics = merged_aggregate.metrics().unwrap(); + let output_rows = metrics.output_rows().unwrap(); + assert_eq!(3, output_rows); + + Ok(()) + } + + /// Define a test source that can yield back to runtime before returning its first item /// + + #[derive(Debug)] + struct TestYieldingExec { + /// True if this exec should yield back to runtime the first time it is polled + pub yield_first: bool, + } + + #[async_trait] + impl ExecutionPlan for TestYieldingExec { + fn as_any(&self) -> &dyn Any { + self + } + fn schema(&self) -> SchemaRef { + some_data().0 + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn output_ordering(&self) -> 
Option<&[PhysicalSortExpr]> { + None + } + + fn children(&self) -> Vec> { + vec![] + } + + fn with_new_children( + self: Arc, + _: Vec>, + ) -> Result> { + Err(DataFusionError::Internal(format!( + "Children cannot be replaced in {:?}", + self + ))) + } + + async fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + let stream = if self.yield_first { + TestYieldingStream::New + } else { + TestYieldingStream::Yielded + }; + + Ok(Box::pin(stream)) + } + + fn statistics(&self) -> Statistics { + let (_, batches) = some_data(); + common::compute_record_batch_statistics(&[batches], &self.schema(), None) + } + } + + /// A stream using the demo data. If inited as new, it will first yield to runtime before returning records + enum TestYieldingStream { + New, + Yielded, + ReturnedBatch1, + ReturnedBatch2, + } + + impl Stream for TestYieldingStream { + type Item = ArrowResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + match &*self { + TestYieldingStream::New => { + *(self.as_mut()) = TestYieldingStream::Yielded; + cx.waker().wake_by_ref(); + Poll::Pending + } + TestYieldingStream::Yielded => { + *(self.as_mut()) = TestYieldingStream::ReturnedBatch1; + Poll::Ready(Some(Ok(some_data().1[0].clone()))) + } + TestYieldingStream::ReturnedBatch1 => { + *(self.as_mut()) = TestYieldingStream::ReturnedBatch2; + Poll::Ready(Some(Ok(some_data().1[1].clone()))) + } + TestYieldingStream::ReturnedBatch2 => Poll::Ready(None), + } + } + } + + impl RecordBatchStream for TestYieldingStream { + fn schema(&self) -> SchemaRef { + some_data().0 + } + } + + //// Tests //// + + #[tokio::test] + async fn aggregate_source_not_yielding() -> Result<()> { + let input: Arc = + Arc::new(TestYieldingExec { yield_first: false }); + + check_aggregates(input).await + } + + #[tokio::test] + async fn aggregate_source_with_yielding() -> Result<()> { + let input: Arc = + Arc::new(TestYieldingExec { yield_first: true }); + + 
check_aggregates(input).await + } + + #[tokio::test] + async fn test_drop_cancel_without_groups() -> Result<()> { + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let schema = + Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, true)])); + + let groups = vec![]; + + let aggregates: Vec> = vec![Arc::new(Avg::new( + col("a", &schema)?, + "AVG(a)".to_string(), + DataType::Float64, + ))]; + + let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); + let refs = blocking_exec.refs(); + let hash_aggregate_exec = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + groups.clone(), + aggregates.clone(), + blocking_exec, + schema, + )?); + + let fut = crate::physical_plan::collect(hash_aggregate_exec, task_ctx); + let mut fut = fut.boxed(); + + assert_is_pending(&mut fut); + drop(fut); + assert_strong_count_converges_to_zero(refs).await; + + Ok(()) + } + + #[tokio::test] + async fn test_drop_cancel_with_groups() -> Result<()> { + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Float32, true), + Field::new("b", DataType::Float32, true), + ])); + + let groups: Vec<(Arc, String)> = + vec![(col("a", &schema)?, "a".to_string())]; + + let aggregates: Vec> = vec![Arc::new(Avg::new( + col("b", &schema)?, + "AVG(b)".to_string(), + DataType::Float64, + ))]; + + let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); + let refs = blocking_exec.refs(); + let hash_aggregate_exec = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + groups.clone(), + aggregates.clone(), + blocking_exec, + schema, + )?); + + let fut = crate::physical_plan::collect(hash_aggregate_exec, task_ctx); + let mut fut = fut.boxed(); + + assert_is_pending(&mut fut); + drop(fut); + assert_strong_count_converges_to_zero(refs).await; + + Ok(()) + } +} diff --git 
a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs new file mode 100644 index 0000000000000..3398eba3cfa82 --- /dev/null +++ b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
 Aggregate without grouping columns + +use crate::physical_plan::aggregates::{ + aggregate_expressions, create_accumulators, finalize_aggregation, AccumulatorItem, + AggregateMode, +}; +use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; +use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; +use arrow::datatypes::SchemaRef; +use arrow::error::{ArrowError, Result as ArrowResult}; +use arrow::record_batch::RecordBatch; +use datafusion_common::Result; +use datafusion_physical_expr::{AggregateExpr, PhysicalExpr}; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use futures::{ + ready, + stream::{Stream, StreamExt}, +}; + +/// stream struct for aggregation without grouping columns +pub(crate) struct NoGroupingAggregateStream { + schema: SchemaRef, + mode: AggregateMode, + input: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + aggregate_expressions: Vec>>, + accumulators: Vec, + finished: bool, +} + +impl NoGroupingAggregateStream { + /// Create a new NoGroupingAggregateStream + pub fn new( + mode: AggregateMode, + schema: SchemaRef, + aggr_expr: Vec>, + input: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + ) -> datafusion_common::Result { + let aggregate_expressions = aggregate_expressions(&aggr_expr, &mode, 0)?; + let accumulators = create_accumulators(&aggr_expr)?; + + Ok(Self { + schema, + mode, + input, + baseline_metrics, + aggregate_expressions, + accumulators, + finished: false, + }) + } +} + +impl Stream for NoGroupingAggregateStream { + type Item = ArrowResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let this = &mut *self; + if this.finished { + return Poll::Ready(None); + } + + let elapsed_compute = this.baseline_metrics.elapsed_compute(); + + loop { + let result = match ready!(this.input.poll_next_unpin(cx)) { + Some(Ok(batch)) => { + let timer = elapsed_compute.timer(); + let result = aggregate_batch( + &this.mode, + &batch, 
&mut this.accumulators, + &this.aggregate_expressions, + ); + + timer.done(); + + match result { + Ok(_) => continue, + Err(e) => Err(ArrowError::ExternalError(Box::new(e))), + } + } + Some(Err(e)) => Err(e), + None => { + this.finished = true; + let timer = this.baseline_metrics.elapsed_compute().timer(); + let result = finalize_aggregation(&this.accumulators, &this.mode) + .map_err(|e| ArrowError::ExternalError(Box::new(e))) + .and_then(|columns| { + RecordBatch::try_new(this.schema.clone(), columns) + }) + .record_output(&this.baseline_metrics); + + timer.done(); + result + } + }; + + this.finished = true; + return Poll::Ready(Some(result)); + } + } +} + +impl RecordBatchStream for NoGroupingAggregateStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +/// TODO: Make this a member function +fn aggregate_batch( + mode: &AggregateMode, + batch: &RecordBatch, + accumulators: &mut [AccumulatorItem], + expressions: &[Vec>], +) -> Result<()> { + // 1.1 iterate accumulators and respective expressions together + // 1.2 evaluate expressions + // 1.3 update / merge accumulators with the expressions' values + + // 1.1 + accumulators + .iter_mut() + .zip(expressions) + .try_for_each(|(accum, expr)| { + // 1.2 + let values = &expr + .iter() + .map(|e| e.evaluate(batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>()?; + + // 1.3 + match mode { + AggregateMode::Partial => accum.update_batch(values), + AggregateMode::Final | AggregateMode::FinalPartitioned => { + accum.merge_batch(values) + } + } + }) +} diff --git a/datafusion/core/src/physical_plan/hash_aggregate.rs b/datafusion/core/src/physical_plan/hash_aggregate.rs deleted file mode 100644 index 6431745579975..0000000000000 --- a/datafusion/core/src/physical_plan/hash_aggregate.rs +++ /dev/null @@ -1,1299 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines the execution plan for the hash aggregate operation - -use std::any::Any; -use std::sync::Arc; -use std::task::{Context, Poll}; -use std::vec; - -use ahash::RandomState; -use futures::{ - ready, - stream::{Stream, StreamExt}, -}; - -use crate::error::Result; -use crate::physical_plan::hash_utils::create_hashes; -use crate::physical_plan::{ - Accumulator, AggregateExpr, DisplayFormatType, Distribution, ExecutionPlan, - Partitioning, PhysicalExpr, -}; -use crate::scalar::ScalarValue; - -use arrow::{array::ArrayRef, compute, compute::cast}; -use arrow::{ - array::{Array, UInt32Builder}, - error::{ArrowError, Result as ArrowResult}, -}; -use arrow::{ - datatypes::{Field, Schema, SchemaRef}, - record_batch::RecordBatch, -}; -use hashbrown::raw::RawTable; - -use crate::execution::context::TaskContext; -use async_trait::async_trait; - -use super::expressions::PhysicalSortExpr; -use super::metrics::{ - BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, RecordOutput, -}; -use super::Statistics; -use super::{expressions::Column, RecordBatchStream, SendableRecordBatchStream}; - -/// Hash aggregate modes -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub enum AggregateMode { - /// Partial aggregate that can be applied in parallel across input partitions - Partial, - 
/// Final aggregate that produces a single partition of output - Final, - /// Final aggregate that works on pre-partitioned data. - /// - /// This requires the invariant that all rows with a particular - /// grouping key are in the same partitions, such as is the case - /// with Hash repartitioning on the group keys. If a group key is - /// duplicated, duplicate groups would be produced - FinalPartitioned, -} - -/// Hash aggregate execution plan -#[derive(Debug)] -pub struct HashAggregateExec { - /// Aggregation mode (full, partial) - mode: AggregateMode, - /// Grouping expressions - group_expr: Vec<(Arc, String)>, - /// Aggregate expressions - aggr_expr: Vec>, - /// Input plan, could be a partial aggregate or the input to the aggregate - input: Arc, - /// Schema after the aggregate is applied - schema: SchemaRef, - /// Input schema before any aggregation is applied. For partial aggregate this will be the - /// same as input.schema() but for the final aggregate it will be the same as the input - /// to the partial aggregate - input_schema: SchemaRef, - /// Execution Metrics - metrics: ExecutionPlanMetricsSet, -} - -fn create_schema( - input_schema: &Schema, - group_expr: &[(Arc, String)], - aggr_expr: &[Arc], - mode: AggregateMode, -) -> Result { - let mut fields = Vec::with_capacity(group_expr.len() + aggr_expr.len()); - for (expr, name) in group_expr { - fields.push(Field::new( - name, - expr.data_type(input_schema)?, - expr.nullable(input_schema)?, - )) - } - - match mode { - AggregateMode::Partial => { - // in partial mode, the fields of the accumulator's state - for expr in aggr_expr { - fields.extend(expr.state_fields()?.iter().cloned()) - } - } - AggregateMode::Final | AggregateMode::FinalPartitioned => { - // in final mode, the field with the final result of the accumulator - for expr in aggr_expr { - fields.push(expr.field()?) 
- } - } - } - - Ok(Schema::new(fields)) -} - -impl HashAggregateExec { - /// Create a new hash aggregate execution plan - pub fn try_new( - mode: AggregateMode, - group_expr: Vec<(Arc, String)>, - aggr_expr: Vec>, - input: Arc, - input_schema: SchemaRef, - ) -> Result { - let schema = create_schema(&input.schema(), &group_expr, &aggr_expr, mode)?; - - let schema = Arc::new(schema); - - Ok(HashAggregateExec { - mode, - group_expr, - aggr_expr, - input, - schema, - input_schema, - metrics: ExecutionPlanMetricsSet::new(), - }) - } - - /// Aggregation mode (full, partial) - pub fn mode(&self) -> &AggregateMode { - &self.mode - } - - /// Grouping expressions - pub fn group_expr(&self) -> &[(Arc, String)] { - &self.group_expr - } - - /// Grouping expressions as they occur in the output schema - pub fn output_group_expr(&self) -> Vec> { - // Update column indices. Since the group by columns come first in the output schema, their - // indices are simply 0..self.group_expr(len). - self.group_expr - .iter() - .enumerate() - .map(|(index, (_col, name))| { - Arc::new(Column::new(name, index)) as Arc - }) - .collect() - } - - /// Aggregate expressions - pub fn aggr_expr(&self) -> &[Arc] { - &self.aggr_expr - } - - /// Input plan - pub fn input(&self) -> &Arc { - &self.input - } - - /// Get the input schema before any aggregates are applied - pub fn input_schema(&self) -> SchemaRef { - self.input_schema.clone() - } -} - -#[async_trait] -impl ExecutionPlan for HashAggregateExec { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - - fn children(&self) -> Vec> { - vec![self.input.clone()] - } - - fn required_child_distribution(&self) -> Distribution { - match &self.mode { - AggregateMode::Partial => Distribution::UnspecifiedDistribution, - AggregateMode::FinalPartitioned => Distribution::HashPartitioned( - self.group_expr.iter().map(|x| x.0.clone()).collect(), - 
), - AggregateMode::Final => Distribution::SinglePartition, - } - } - - /// Get the output partitioning of this plan - fn output_partitioning(&self) -> Partitioning { - self.input.output_partitioning() - } - - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - None - } - - fn relies_on_input_order(&self) -> bool { - false - } - - async fn execute( - &self, - partition: usize, - context: Arc, - ) -> Result { - let input = self.input.execute(partition, context).await?; - let group_expr = self.group_expr.iter().map(|x| x.0.clone()).collect(); - - let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); - - if self.group_expr.is_empty() { - Ok(Box::pin(HashAggregateStream::new( - self.mode, - self.schema.clone(), - self.aggr_expr.clone(), - input, - baseline_metrics, - )?)) - } else { - Ok(Box::pin(GroupedHashAggregateStream::new( - self.mode, - self.schema.clone(), - group_expr, - self.aggr_expr.clone(), - input, - baseline_metrics, - )?)) - } - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> Result> { - Ok(Arc::new(HashAggregateExec::try_new( - self.mode, - self.group_expr.clone(), - self.aggr_expr.clone(), - children[0].clone(), - self.input_schema.clone(), - )?)) - } - - fn metrics(&self) -> Option { - Some(self.metrics.clone_inner()) - } - - fn fmt_as( - &self, - t: DisplayFormatType, - f: &mut std::fmt::Formatter, - ) -> std::fmt::Result { - match t { - DisplayFormatType::Default => { - write!(f, "HashAggregateExec: mode={:?}", self.mode)?; - let g: Vec = self - .group_expr - .iter() - .map(|(e, alias)| { - let e = e.to_string(); - if &e != alias { - format!("{} as {}", e, alias) - } else { - e - } - }) - .collect(); - write!(f, ", gby=[{}]", g.join(", "))?; - - let a: Vec = self - .aggr_expr - .iter() - .map(|agg| agg.name().to_string()) - .collect(); - write!(f, ", aggr=[{}]", a.join(", "))?; - } - } - Ok(()) - } - - fn statistics(&self) -> Statistics { - // TODO stats: group expressions: - // - once expressions will be 
able to compute their own stats, use it here - // - case where we group by on a column for which with have the `distinct` stat - // TODO stats: aggr expression: - // - aggregations somtimes also preserve invariants such as min, max... - match self.mode { - AggregateMode::Final | AggregateMode::FinalPartitioned - if self.group_expr.is_empty() => - { - Statistics { - num_rows: Some(1), - is_exact: true, - ..Default::default() - } - } - _ => Statistics::default(), - } - } -} - -/* -The architecture is the following: - -1. An accumulator has state that is updated on each batch. -2. At the end of the aggregation (e.g. end of batches in a partition), the accumulator converts its state to a RecordBatch of a single row -3. The RecordBatches of all accumulators are merged (`concatenate` in `rust/arrow`) together to a single RecordBatch. -4. The state's RecordBatch is `merge`d to a new state -5. The state is mapped to the final value - -Why: - -* Accumulators' state can be statically typed, but it is more efficient to transmit data from the accumulators via `Array` -* The `merge` operation must have access to the state of the aggregators because it uses it to correctly merge -* It uses Arrow's native dynamically typed object, `Array`. -* Arrow shines in batch operations and both `merge` and `concatenate` of uniform types are very performant. - -Example: average - -* the state is `n: u32` and `sum: f64` -* For every batch, we update them accordingly. -* At the end of the accumulation (of a partition), we convert `n` and `sum` to a RecordBatch of 1 row and two columns: `[n, sum]` -* The RecordBatch is (sent back / transmitted over network) -* Once all N record batches arrive, `merge` is performed, which builds a RecordBatch with N rows and 2 columns. 
-* Finally, `get_value` returns an array with one entry computed from the state -*/ -struct GroupedHashAggregateStream { - schema: SchemaRef, - input: SendableRecordBatchStream, - mode: AggregateMode, - accumulators: Accumulators, - aggregate_expressions: Vec>>, - - aggr_expr: Vec>, - group_expr: Vec>, - - baseline_metrics: BaselineMetrics, - random_state: RandomState, - finished: bool, -} - -impl GroupedHashAggregateStream { - /// Create a new HashAggregateStream - pub fn new( - mode: AggregateMode, - schema: SchemaRef, - group_expr: Vec>, - aggr_expr: Vec>, - input: SendableRecordBatchStream, - baseline_metrics: BaselineMetrics, - ) -> Result { - let timer = baseline_metrics.elapsed_compute().timer(); - - // The expressions to evaluate the batch, one vec of expressions per aggregation. - // Assume create_schema() always put group columns in front of aggr columns, we set - // col_idx_base to group expression count. - let aggregate_expressions = - aggregate_expressions(&aggr_expr, &mode, group_expr.len())?; - - timer.done(); - - Ok(Self { - schema, - mode, - input, - aggr_expr, - group_expr, - baseline_metrics, - aggregate_expressions, - accumulators: Default::default(), - random_state: Default::default(), - finished: false, - }) - } -} - -impl Stream for GroupedHashAggregateStream { - type Item = ArrowResult; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - let this = &mut *self; - if this.finished { - return Poll::Ready(None); - } - - let elapsed_compute = this.baseline_metrics.elapsed_compute(); - - loop { - let result = match ready!(this.input.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - let timer = elapsed_compute.timer(); - let result = group_aggregate_batch( - &this.mode, - &this.random_state, - &this.group_expr, - &this.aggr_expr, - batch, - &mut this.accumulators, - &this.aggregate_expressions, - ); - - timer.done(); - - match result { - Ok(_) => continue, - Err(e) => 
Err(ArrowError::ExternalError(Box::new(e))), - } - } - Some(Err(e)) => Err(e), - None => { - this.finished = true; - let timer = this.baseline_metrics.elapsed_compute().timer(); - let result = create_batch_from_map( - &this.mode, - &this.accumulators, - this.group_expr.len(), - &this.schema, - ) - .record_output(&this.baseline_metrics); - - timer.done(); - result - } - }; - - this.finished = true; - return Poll::Ready(Some(result)); - } - } -} - -impl RecordBatchStream for GroupedHashAggregateStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -/// TODO: Make this a member function of [`GroupedHashAggregateStream`] -fn group_aggregate_batch( - mode: &AggregateMode, - random_state: &RandomState, - group_expr: &[Arc], - aggr_expr: &[Arc], - batch: RecordBatch, - accumulators: &mut Accumulators, - aggregate_expressions: &[Vec>], -) -> Result<()> { - // evaluate the grouping expressions - let group_values = evaluate(group_expr, &batch)?; - - // evaluate the aggregation expressions. - // We could evaluate them after the `take`, but since we need to evaluate all - // of them anyways, it is more performant to do it while they are together. 
- let aggr_input_values = evaluate_many(aggregate_expressions, &batch)?; - - // 1.1 construct the key from the group values - // 1.2 construct the mapping key if it does not exist - // 1.3 add the row' index to `indices` - - // track which entries in `accumulators` have rows in this batch to aggregate - let mut groups_with_rows = vec![]; - - // 1.1 Calculate the group keys for the group values - let mut batch_hashes = vec![0; batch.num_rows()]; - create_hashes(&group_values, random_state, &mut batch_hashes)?; - - for (row, hash) in batch_hashes.into_iter().enumerate() { - let Accumulators { map, group_states } = accumulators; - - let entry = map.get_mut(hash, |(_hash, group_idx)| { - // verify that a group that we are inserting with hash is - // actually the same key value as the group in - // existing_idx (aka group_values @ row) - let group_state = &group_states[*group_idx]; - group_values - .iter() - .zip(group_state.group_by_values.iter()) - .all(|(array, scalar)| scalar.eq_array(array, row)) - }); - - match entry { - // Existing entry for this group value - Some((_hash, group_idx)) => { - let group_state = &mut group_states[*group_idx]; - // 1.3 - if group_state.indices.is_empty() { - groups_with_rows.push(*group_idx); - }; - group_state.indices.push(row as u32); // remember this row - } - // 1.2 Need to create new entry - None => { - let accumulator_set = create_accumulators(aggr_expr)?; - - // Copy group values out of arrays into `ScalarValue`s - let group_by_values = group_values - .iter() - .map(|col| ScalarValue::try_from_array(col, row)) - .collect::>>()?; - - // Add new entry to group_states and save newly created index - let group_state = GroupState { - group_by_values: group_by_values.into_boxed_slice(), - accumulator_set, - indices: vec![row as u32], // 1.3 - }; - let group_idx = group_states.len(); - group_states.push(group_state); - groups_with_rows.push(group_idx); - - // for hasher function, use precomputed hash value - map.insert(hash, (hash, 
group_idx), |(hash, _group_idx)| *hash); - } - }; - } - - // Collect all indices + offsets based on keys in this vec - let mut batch_indices: UInt32Builder = UInt32Builder::new(0); - let mut offsets = vec![0]; - let mut offset_so_far = 0; - for group_idx in groups_with_rows.iter() { - let indices = &accumulators.group_states[*group_idx].indices; - batch_indices.append_slice(indices)?; - offset_so_far += indices.len(); - offsets.push(offset_so_far); - } - let batch_indices = batch_indices.finish(); - - // `Take` all values based on indices into Arrays - let values: Vec>> = aggr_input_values - .iter() - .map(|array| { - array - .iter() - .map(|array| { - compute::take( - array.as_ref(), - &batch_indices, - None, // None: no index check - ) - .unwrap() - }) - .collect() - // 2.3 - }) - .collect(); - - // 2.1 for each key in this batch - // 2.2 for each aggregation - // 2.3 `slice` from each of its arrays the keys' values - // 2.4 update / merge the accumulator with the values - // 2.5 clear indices - groups_with_rows - .iter() - .zip(offsets.windows(2)) - .try_for_each(|(group_idx, offsets)| { - let group_state = &mut accumulators.group_states[*group_idx]; - // 2.2 - group_state - .accumulator_set - .iter_mut() - .zip(values.iter()) - .map(|(accumulator, aggr_array)| { - ( - accumulator, - aggr_array - .iter() - .map(|array| { - // 2.3 - array.slice(offsets[0], offsets[1] - offsets[0]) - }) - .collect::>(), - ) - }) - .try_for_each(|(accumulator, values)| match mode { - AggregateMode::Partial => accumulator.update_batch(&values), - AggregateMode::FinalPartitioned | AggregateMode::Final => { - // note: the aggregation here is over states, not values, thus the merge - accumulator.merge_batch(&values) - } - }) - // 2.5 - .and({ - group_state.indices.clear(); - Ok(()) - }) - })?; - - Ok(()) -} - -type AccumulatorItem = Box; - -/// The state that is built for each output group. 
-#[derive(Debug)] -struct GroupState { - /// The actual group by values, one for each group column - group_by_values: Box<[ScalarValue]>, - - // Accumulator state, one for each aggregate - accumulator_set: Vec, - - /// scratch space used to collect indices for input rows in a - /// bach that have values to aggregate. Reset on each batch - indices: Vec, -} - -/// The state of all the groups -#[derive(Default)] -struct Accumulators { - /// Logically maps group values to an index in `group_states` - /// - /// Uses the raw API of hashbrown to avoid actually storing the - /// keys in the table - /// - /// keys: u64 hashes of the GroupValue - /// values: (hash, index into `group_states`) - map: RawTable<(u64, usize)>, - - /// State for each group - group_states: Vec, -} - -impl std::fmt::Debug for Accumulators { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - // hashes are not store inline, so could only get values - let map_string = "RawTable"; - f.debug_struct("Accumulators") - .field("map", &map_string) - .field("group_states", &self.group_states) - .finish() - } -} - -/// Evaluates expressions against a record batch. -fn evaluate( - expr: &[Arc], - batch: &RecordBatch, -) -> Result> { - expr.iter() - .map(|expr| expr.evaluate(batch)) - .map(|r| r.map(|v| v.into_array(batch.num_rows()))) - .collect::>>() -} - -/// Evaluates expressions against a record batch. -fn evaluate_many( - expr: &[Vec>], - batch: &RecordBatch, -) -> Result>> { - expr.iter() - .map(|expr| evaluate(expr, batch)) - .collect::>>() -} - -/// uses `state_fields` to build a vec of physical column expressions required to merge the -/// AggregateExpr' accumulator's state. -/// -/// `index_base` is the starting physical column index for the next expanded state field. -fn merge_expressions( - index_base: usize, - expr: &Arc, -) -> Result>> { - Ok(expr - .state_fields()? 
- .iter() - .enumerate() - .map(|(idx, f)| { - Arc::new(Column::new(f.name(), index_base + idx)) as Arc - }) - .collect::>()) -} - -/// returns physical expressions to evaluate against a batch -/// The expressions are different depending on `mode`: -/// * Partial: AggregateExpr::expressions -/// * Final: columns of `AggregateExpr::state_fields()` -fn aggregate_expressions( - aggr_expr: &[Arc], - mode: &AggregateMode, - col_idx_base: usize, -) -> Result>>> { - match mode { - AggregateMode::Partial => { - Ok(aggr_expr.iter().map(|agg| agg.expressions()).collect()) - } - // in this mode, we build the merge expressions of the aggregation - AggregateMode::Final | AggregateMode::FinalPartitioned => { - let mut col_idx_base = col_idx_base; - Ok(aggr_expr - .iter() - .map(|agg| { - let exprs = merge_expressions(col_idx_base, agg)?; - col_idx_base += exprs.len(); - Ok(exprs) - }) - .collect::>>()?) - } - } -} - -/// stream struct for hash aggregation -pub struct HashAggregateStream { - schema: SchemaRef, - mode: AggregateMode, - input: SendableRecordBatchStream, - baseline_metrics: BaselineMetrics, - aggregate_expressions: Vec>>, - accumulators: Vec, - finished: bool, -} - -impl HashAggregateStream { - /// Create a new HashAggregateStream - pub fn new( - mode: AggregateMode, - schema: SchemaRef, - aggr_expr: Vec>, - input: SendableRecordBatchStream, - baseline_metrics: BaselineMetrics, - ) -> Result { - let aggregate_expressions = aggregate_expressions(&aggr_expr, &mode, 0)?; - let accumulators = create_accumulators(&aggr_expr)?; - - Ok(Self { - schema, - mode, - input, - baseline_metrics, - aggregate_expressions, - accumulators, - finished: false, - }) - } -} - -/// TODO: Make this a member function -fn aggregate_batch( - mode: &AggregateMode, - batch: &RecordBatch, - accumulators: &mut [AccumulatorItem], - expressions: &[Vec>], -) -> Result<()> { - // 1.1 iterate accumulators and respective expressions together - // 1.2 evaluate expressions - // 1.3 update / merge 
accumulators with the expressions' values - - // 1.1 - accumulators - .iter_mut() - .zip(expressions) - .try_for_each(|(accum, expr)| { - // 1.2 - let values = &expr - .iter() - .map(|e| e.evaluate(batch)) - .map(|r| r.map(|v| v.into_array(batch.num_rows()))) - .collect::>>()?; - - // 1.3 - match mode { - AggregateMode::Partial => accum.update_batch(values), - AggregateMode::Final | AggregateMode::FinalPartitioned => { - accum.merge_batch(values) - } - } - }) -} - -impl Stream for HashAggregateStream { - type Item = ArrowResult; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - let this = &mut *self; - if this.finished { - return Poll::Ready(None); - } - - let elapsed_compute = this.baseline_metrics.elapsed_compute(); - - loop { - let result = match ready!(this.input.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - let timer = elapsed_compute.timer(); - let result = aggregate_batch( - &this.mode, - &batch, - &mut this.accumulators, - &this.aggregate_expressions, - ); - - timer.done(); - - match result { - Ok(_) => continue, - Err(e) => Err(ArrowError::ExternalError(Box::new(e))), - } - } - Some(Err(e)) => Err(e), - None => { - this.finished = true; - let timer = this.baseline_metrics.elapsed_compute().timer(); - let result = finalize_aggregation(&this.accumulators, &this.mode) - .map_err(|e| ArrowError::ExternalError(Box::new(e))) - .and_then(|columns| { - RecordBatch::try_new(this.schema.clone(), columns) - }) - .record_output(&this.baseline_metrics); - - timer.done(); - result - } - }; - - this.finished = true; - return Poll::Ready(Some(result)); - } - } -} - -impl RecordBatchStream for HashAggregateStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -/// Create a RecordBatch with all group keys and accumulator' states or values. 
-fn create_batch_from_map( - mode: &AggregateMode, - accumulators: &Accumulators, - num_group_expr: usize, - output_schema: &Schema, -) -> ArrowResult { - if accumulators.group_states.is_empty() { - return Ok(RecordBatch::new_empty(Arc::new(output_schema.to_owned()))); - } - let accs = &accumulators.group_states[0].accumulator_set; - let mut acc_data_types: Vec = vec![]; - - // Calculate number/shape of state arrays - match mode { - AggregateMode::Partial => { - for acc in accs.iter() { - let state = acc.state()?; - acc_data_types.push(state.len()); - } - } - AggregateMode::Final | AggregateMode::FinalPartitioned => { - acc_data_types = vec![1; accs.len()]; - } - } - - let mut columns = (0..num_group_expr) - .map(|i| { - ScalarValue::iter_to_array( - accumulators - .group_states - .iter() - .map(|group_state| group_state.group_by_values[i].clone()), - ) - }) - .collect::>>()?; - - // add state / evaluated arrays - for (x, &state_len) in acc_data_types.iter().enumerate() { - for y in 0..state_len { - match mode { - AggregateMode::Partial => { - let res = ScalarValue::iter_to_array( - accumulators.group_states.iter().map(|group_state| { - let x = group_state.accumulator_set[x].state().unwrap(); - x[y].clone() - }), - )?; - - columns.push(res); - } - AggregateMode::Final | AggregateMode::FinalPartitioned => { - let res = ScalarValue::iter_to_array( - accumulators.group_states.iter().map(|group_state| { - group_state.accumulator_set[x].evaluate().unwrap() - }), - )?; - columns.push(res); - } - } - } - } - - // cast output if needed (e.g. 
for types like Dictionary where - // the intermediate GroupByScalar type was not the same as the - // output - let columns = columns - .iter() - .zip(output_schema.fields().iter()) - .map(|(col, desired_field)| cast(col, desired_field.data_type())) - .collect::>>()?; - - RecordBatch::try_new(Arc::new(output_schema.to_owned()), columns) -} - -fn create_accumulators( - aggr_expr: &[Arc], -) -> Result> { - aggr_expr - .iter() - .map(|expr| expr.create_accumulator()) - .collect::>>() -} - -/// returns a vector of ArrayRefs, where each entry corresponds to either the -/// final value (mode = Final) or states (mode = Partial) -fn finalize_aggregation( - accumulators: &[AccumulatorItem], - mode: &AggregateMode, -) -> Result> { - match mode { - AggregateMode::Partial => { - // build the vector of states - let a = accumulators - .iter() - .map(|accumulator| accumulator.state()) - .map(|value| { - value.map(|e| { - e.iter().map(|v| v.to_array()).collect::>() - }) - }) - .collect::>>()?; - Ok(a.iter().flatten().cloned().collect::>()) - } - AggregateMode::Final | AggregateMode::FinalPartitioned => { - // merge the state to the final value - accumulators - .iter() - .map(|accumulator| accumulator.evaluate().map(|v| v.to_array())) - .collect::>>() - } - } -} - -#[cfg(test)] -mod tests { - - use super::*; - use crate::from_slice::FromSlice; - use crate::physical_plan::expressions::{col, Avg}; - use crate::test::assert_is_pending; - use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; - use crate::{assert_batches_sorted_eq, physical_plan::common}; - use arrow::array::{Float64Array, UInt32Array}; - use arrow::datatypes::DataType; - use datafusion_common::DataFusionError; - use futures::FutureExt; - - use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; - use crate::prelude::SessionContext; - - /// some mock data to aggregates - fn some_data() -> (Arc, Vec) { - // define a schema. 
- let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::UInt32, false), - Field::new("b", DataType::Float64, false), - ])); - - // define data. - ( - schema.clone(), - vec![ - RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(UInt32Array::from_slice(&[2, 3, 4, 4])), - Arc::new(Float64Array::from_slice(&[1.0, 2.0, 3.0, 4.0])), - ], - ) - .unwrap(), - RecordBatch::try_new( - schema, - vec![ - Arc::new(UInt32Array::from_slice(&[2, 3, 3, 4])), - Arc::new(Float64Array::from_slice(&[1.0, 2.0, 3.0, 4.0])), - ], - ) - .unwrap(), - ], - ) - } - - /// build the aggregates on the data from some_data() and check the results - async fn check_aggregates(input: Arc) -> Result<()> { - let input_schema = input.schema(); - - let groups: Vec<(Arc, String)> = - vec![(col("a", &input_schema)?, "a".to_string())]; - - let aggregates: Vec> = vec![Arc::new(Avg::new( - col("b", &input_schema)?, - "AVG(b)".to_string(), - DataType::Float64, - ))]; - - let session_ctx = SessionContext::new(); - let task_ctx = session_ctx.task_ctx(); - - let partial_aggregate = Arc::new(HashAggregateExec::try_new( - AggregateMode::Partial, - groups.clone(), - aggregates.clone(), - input, - input_schema.clone(), - )?); - - let result = - common::collect(partial_aggregate.execute(0, task_ctx.clone()).await?) 
- .await?; - - let expected = vec![ - "+---+---------------+-------------+", - "| a | AVG(b)[count] | AVG(b)[sum] |", - "+---+---------------+-------------+", - "| 2 | 2 | 2 |", - "| 3 | 3 | 7 |", - "| 4 | 3 | 11 |", - "+---+---------------+-------------+", - ]; - assert_batches_sorted_eq!(expected, &result); - - let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); - - let final_group: Vec> = (0..groups.len()) - .map(|i| col(&groups[i].1, &input_schema)) - .collect::>()?; - - let merged_aggregate = Arc::new(HashAggregateExec::try_new( - AggregateMode::Final, - final_group - .iter() - .enumerate() - .map(|(i, expr)| (expr.clone(), groups[i].1.clone())) - .collect(), - aggregates, - merge, - input_schema, - )?); - - let result = - common::collect(merged_aggregate.execute(0, task_ctx.clone()).await?).await?; - assert_eq!(result.len(), 1); - - let batch = &result[0]; - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 3); - - let expected = vec![ - "+---+--------------------+", - "| a | AVG(b) |", - "+---+--------------------+", - "| 2 | 1 |", - "| 3 | 2.3333333333333335 |", // 3, (2 + 3 + 2) / 3 - "| 4 | 3.6666666666666665 |", // 4, (3 + 4 + 4) / 3 - "+---+--------------------+", - ]; - - assert_batches_sorted_eq!(&expected, &result); - - let metrics = merged_aggregate.metrics().unwrap(); - let output_rows = metrics.output_rows().unwrap(); - assert_eq!(3, output_rows); - - Ok(()) - } - - /// Define a test source that can yield back to runtime before returning its first item /// - - #[derive(Debug)] - struct TestYieldingExec { - /// True if this exec should yield back to runtime the first time it is polled - pub yield_first: bool, - } - - #[async_trait] - impl ExecutionPlan for TestYieldingExec { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { - some_data().0 - } - - fn children(&self) -> Vec> { - vec![] - } - - fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(1) - } - - 
fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - None - } - - fn with_new_children( - self: Arc, - _: Vec>, - ) -> Result> { - Err(DataFusionError::Internal(format!( - "Children cannot be replaced in {:?}", - self - ))) - } - - async fn execute( - &self, - _partition: usize, - _context: Arc, - ) -> Result { - let stream = if self.yield_first { - TestYieldingStream::New - } else { - TestYieldingStream::Yielded - }; - - Ok(Box::pin(stream)) - } - - fn statistics(&self) -> Statistics { - let (_, batches) = some_data(); - common::compute_record_batch_statistics(&[batches], &self.schema(), None) - } - } - - /// A stream using the demo data. If inited as new, it will first yield to runtime before returning records - enum TestYieldingStream { - New, - Yielded, - ReturnedBatch1, - ReturnedBatch2, - } - - impl Stream for TestYieldingStream { - type Item = ArrowResult; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - match &*self { - TestYieldingStream::New => { - *(self.as_mut()) = TestYieldingStream::Yielded; - cx.waker().wake_by_ref(); - Poll::Pending - } - TestYieldingStream::Yielded => { - *(self.as_mut()) = TestYieldingStream::ReturnedBatch1; - Poll::Ready(Some(Ok(some_data().1[0].clone()))) - } - TestYieldingStream::ReturnedBatch1 => { - *(self.as_mut()) = TestYieldingStream::ReturnedBatch2; - Poll::Ready(Some(Ok(some_data().1[1].clone()))) - } - TestYieldingStream::ReturnedBatch2 => Poll::Ready(None), - } - } - } - - impl RecordBatchStream for TestYieldingStream { - fn schema(&self) -> SchemaRef { - some_data().0 - } - } - - //// Tests //// - - #[tokio::test] - async fn aggregate_source_not_yielding() -> Result<()> { - let input: Arc = - Arc::new(TestYieldingExec { yield_first: false }); - - check_aggregates(input).await - } - - #[tokio::test] - async fn aggregate_source_with_yielding() -> Result<()> { - let input: Arc = - Arc::new(TestYieldingExec { yield_first: true }); - - check_aggregates(input).await - 
} - - #[tokio::test] - async fn test_drop_cancel_without_groups() -> Result<()> { - let session_ctx = SessionContext::new(); - let task_ctx = session_ctx.task_ctx(); - let schema = - Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, true)])); - - let groups = vec![]; - - let aggregates: Vec> = vec![Arc::new(Avg::new( - col("a", &schema)?, - "AVG(a)".to_string(), - DataType::Float64, - ))]; - - let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); - let refs = blocking_exec.refs(); - let hash_aggregate_exec = Arc::new(HashAggregateExec::try_new( - AggregateMode::Partial, - groups.clone(), - aggregates.clone(), - blocking_exec, - schema, - )?); - - let fut = crate::physical_plan::collect(hash_aggregate_exec, task_ctx); - let mut fut = fut.boxed(); - - assert_is_pending(&mut fut); - drop(fut); - assert_strong_count_converges_to_zero(refs).await; - - Ok(()) - } - - #[tokio::test] - async fn test_drop_cancel_with_groups() -> Result<()> { - let session_ctx = SessionContext::new(); - let task_ctx = session_ctx.task_ctx(); - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Float32, true), - Field::new("b", DataType::Float32, true), - ])); - - let groups: Vec<(Arc, String)> = - vec![(col("a", &schema)?, "a".to_string())]; - - let aggregates: Vec> = vec![Arc::new(Avg::new( - col("b", &schema)?, - "AVG(b)".to_string(), - DataType::Float64, - ))]; - - let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); - let refs = blocking_exec.refs(); - let hash_aggregate_exec = Arc::new(HashAggregateExec::try_new( - AggregateMode::Partial, - groups.clone(), - aggregates.clone(), - blocking_exec, - schema, - )?); - - let fut = crate::physical_plan::collect(hash_aggregate_exec, task_ctx); - let mut fut = fut.boxed(); - - assert_is_pending(&mut fut); - drop(fut); - assert_strong_count_converges_to_zero(refs).await; - - Ok(()) - } -} diff --git a/datafusion/core/src/physical_plan/mod.rs 
b/datafusion/core/src/physical_plan/mod.rs index b7b25a636efc9..dc963c7e1bdc1 100644 --- a/datafusion/core/src/physical_plan/mod.rs +++ b/datafusion/core/src/physical_plan/mod.rs @@ -555,7 +555,6 @@ pub use datafusion_physical_expr::expressions; pub mod file_format; pub mod filter; pub mod functions; -pub mod hash_aggregate; pub mod hash_join; pub mod hash_utils; pub mod join_utils; diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 84785777b016c..966b973b37cde 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -34,6 +34,7 @@ use crate::logical_plan::{ }; use crate::logical_plan::{Limit, Values}; use crate::physical_optimizer::optimizer::PhysicalOptimizerRule; +use crate::physical_plan::aggregates::{AggregateExec, AggregateMode}; use crate::physical_plan::cross_join::CrossJoinExec; use crate::physical_plan::explain::ExplainExec; use crate::physical_plan::expressions; @@ -41,7 +42,6 @@ use crate::physical_plan::expressions::{ CaseExpr, Column, GetIndexedFieldExpr, Literal, PhysicalSortExpr, }; use crate::physical_plan::filter::FilterExec; -use crate::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use crate::physical_plan::hash_join::HashJoinExec; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::projection::ProjectionExec; @@ -524,7 +524,7 @@ impl DefaultPhysicalPlanner { }) .collect::>>()?; - let initial_aggr = Arc::new(HashAggregateExec::try_new( + let initial_aggr = Arc::new(AggregateExec::try_new( AggregateMode::Partial, groups.clone(), aggregates.clone(), @@ -566,7 +566,7 @@ impl DefaultPhysicalPlanner { (initial_aggr, AggregateMode::Final) }; - Ok(Arc::new(HashAggregateExec::try_new( + Ok(Arc::new(AggregateExec::try_new( next_partition_mode, final_group .iter() @@ -1839,7 +1839,7 @@ mod tests { let execution_plan = plan(&logical_plan).await?; let final_hash_agg = execution_plan 
.as_any() - .downcast_ref::() + .downcast_ref::() .expect("hash aggregate"); assert_eq!( "SUM(aggregate_test_100.c2)", diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index a124311aa4ff5..77287f566a6aa 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -110,7 +110,7 @@ async fn explain_analyze_baseline_metrics() { use datafusion::physical_plan::sorts; plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() // CoalescePartitionsExec doesn't do any work so is not included || plan.as_any().downcast_ref::().is_some() || plan.as_any().downcast_ref::().is_some() From fbeaf0b0ded82a72d3c3755eb1366af98f120556 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 29 Apr 2022 12:06:44 +0800 Subject: [PATCH 2/8] basic accumulators --- datafusion/core/Cargo.toml | 2 +- .../core/src/physical_plan/aggregates/hash.rs | 25 +- .../core/src/physical_plan/aggregates/mod.rs | 23 + .../src/physical_plan/aggregates/row_hash.rs | 412 ++++++++++++++++++ .../core/src/physical_plan/hash_utils.rs | 35 ++ datafusion/physical-expr/Cargo.toml | 4 +- .../physical-expr/src/aggregate/average.rs | 71 +++ .../physical-expr/src/aggregate/count.rs | 44 ++ .../physical-expr/src/aggregate/min_max.rs | 162 +++++++ datafusion/physical-expr/src/aggregate/mod.rs | 6 + .../src/aggregate/row_accumulator.rs | 39 ++ datafusion/physical-expr/src/aggregate/sum.rs | 141 +++++- datafusion/row/src/accessor.rs | 302 +++++++++++++ datafusion/row/src/layout.rs | 8 +- datafusion/row/src/lib.rs | 1 + datafusion/row/src/reader.rs | 3 + datafusion/row/src/writer.rs | 2 + 17 files changed, 1252 insertions(+), 28 deletions(-) create mode 100644 datafusion/core/src/physical_plan/aggregates/row_hash.rs create mode 100644 datafusion/physical-expr/src/aggregate/row_accumulator.rs create mode 100644 datafusion/row/src/accessor.rs diff 
--git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 533b38b81c7d7..37361cbb507f2 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -41,7 +41,7 @@ path = "src/lib.rs" # Used to enable the avro format avro = ["avro-rs", "num-traits", "datafusion-common/avro"] crypto_expressions = ["datafusion-physical-expr/crypto_expressions"] -default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] +default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "row"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) force_hash_collisions = [] # Used to enable JIT code generation diff --git a/datafusion/core/src/physical_plan/aggregates/hash.rs b/datafusion/core/src/physical_plan/aggregates/hash.rs index 85e82f14c55d5..c0fcd5413f72e 100644 --- a/datafusion/core/src/physical_plan/aggregates/hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/hash.rs @@ -28,7 +28,9 @@ use futures::{ }; use crate::error::Result; -use crate::physical_plan::aggregates::{AccumulatorItem, AggregateMode}; +use crate::physical_plan::aggregates::{ + evaluate, evaluate_many, AccumulatorItem, AggregateMode, +}; use crate::physical_plan::hash_utils::create_hashes; use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; @@ -380,27 +382,6 @@ impl std::fmt::Debug for Accumulators { } } -/// Evaluates expressions against a record batch. -fn evaluate( - expr: &[Arc], - batch: &RecordBatch, -) -> Result> { - expr.iter() - .map(|expr| expr.evaluate(batch)) - .map(|r| r.map(|v| v.into_array(batch.num_rows()))) - .collect::>>() -} - -/// Evaluates expressions against a record batch. -fn evaluate_many( - expr: &[Vec>], - batch: &RecordBatch, -) -> Result>> { - expr.iter() - .map(|expr| evaluate(expr, batch)) - .collect::>>() -} - /// Create a RecordBatch with all group keys and accumulator' states or values. 
fn create_batch_from_map( mode: &AggregateMode, diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index af7df3dccfc71..d2cb486dba3ca 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -29,6 +29,7 @@ use crate::physical_plan::{ }; use arrow::array::ArrayRef; use arrow::datatypes::{Field, Schema, SchemaRef}; +use arrow::record_batch::RecordBatch; use async_trait::async_trait; use datafusion_common::Result; use datafusion_expr::Accumulator; @@ -41,6 +42,7 @@ use std::sync::Arc; mod hash; mod no_grouping; +mod row_hash; pub use datafusion_expr::AggregateFunction; pub use datafusion_physical_expr::expressions::create_aggregate_expr; @@ -404,6 +406,27 @@ fn finalize_aggregation( } } +/// Evaluates expressions against a record batch. +fn evaluate( + expr: &[Arc], + batch: &RecordBatch, +) -> Result> { + expr.iter() + .map(|expr| expr.evaluate(batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>() +} + +/// Evaluates expressions against a record batch. +fn evaluate_many( + expr: &[Vec>], + batch: &RecordBatch, +) -> Result>> { + expr.iter() + .map(|expr| evaluate(expr, batch)) + .collect::>>() +} + #[cfg(test)] mod tests { use crate::execution::context::TaskContext; diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs new file mode 100644 index 0000000000000..251adb23790b4 --- /dev/null +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -0,0 +1,412 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Hash aggregation through row format + +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::vec; + +use ahash::RandomState; +use futures::{ + ready, + stream::{Stream, StreamExt}, +}; + +use crate::error::Result; +use crate::physical_plan::aggregates::{evaluate, evaluate_many, AggregateMode}; +use crate::physical_plan::hash_utils::create_row_hashes; +use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; +use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; +use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; + +use arrow::datatypes::Schema; +use arrow::{array::ArrayRef, compute}; +use arrow::{ + array::{Array, UInt32Builder}, + error::{ArrowError, Result as ArrowResult}, +}; +use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; +use datafusion_row::layout::RowLayout; +use datafusion_row::writer::{write_row, RowWriter}; +use datafusion_row::RowType; +use hashbrown::raw::RawTable; + +/* +The architecture is the following: + +1. An accumulator has state that is updated on each batch. +2. At the end of the aggregation (e.g. end of batches in a partition), the accumulator converts its state to a RecordBatch of a single row +3. The RecordBatches of all accumulators are merged (`concatenate` in `rust/arrow`) together to a single RecordBatch. +4. The state's RecordBatch is `merge`d to a new state +5. 
The state is mapped to the final value + +Why: + +* Accumulators' state can be statically typed, but it is more efficient to transmit data from the accumulators via `Array` +* The `merge` operation must have access to the state of the aggregators because it uses it to correctly merge +* It uses Arrow's native dynamically typed object, `Array`. +* Arrow shines in batch operations and both `merge` and `concatenate` of uniform types are very performant. + +Example: average + +* the state is `n: u32` and `sum: f64` +* For every batch, we update them accordingly. +* At the end of the accumulation (of a partition), we convert `n` and `sum` to a RecordBatch of 1 row and two columns: `[n, sum]` +* The RecordBatch is (sent back / transmitted over network) +* Once all N record batches arrive, `merge` is performed, which builds a RecordBatch with N rows and 2 columns. +* Finally, `get_value` returns an array with one entry computed from the state +*/ +pub(crate) struct GroupedRowHashAggregateStream { + schema: SchemaRef, + input: SendableRecordBatchStream, + mode: AggregateMode, + accumulators: Accumulators, + aggregate_expressions: Vec>>, + + aggr_expr: Vec>, + group_expr: Vec>, + + group_schema: SchemaRef, + aggr_schema: SchemaRef, + aggr_layout: RowLayout, + aggr_buffer_width: usize, + + baseline_metrics: BaselineMetrics, + random_state: RandomState, + finished: bool, +} + +fn create_separate_schema(schema: &Schema, group_count: usize) -> (SchemaRef, SchemaRef) { + let (group_fields, aggr_fields) = schema.fields().split_at(group_count); + ( + Arc::new(Schema::new(group_fields.to_vec())), + Arc::new(Schema::new(aggr_fields.to_vec())), + ) +} + +impl GroupedRowHashAggregateStream { + /// Create a new GroupedRowHashAggregateStream + pub fn new( + mode: AggregateMode, + schema: SchemaRef, + group_expr: Vec>, + aggr_expr: Vec>, + input: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + ) -> Result { + let timer = baseline_metrics.elapsed_compute().timer(); + + 
// The expressions to evaluate the batch, one vec of expressions per aggregation. + // Assume create_schema() always put group columns in front of aggr columns, we set + // col_idx_base to group expression count. + let aggregate_expressions = + aggregates::aggregate_expressions(&aggr_expr, &mode, group_expr.len())?; + + let (group_schema, aggr_schema) = + create_separate_schema(&schema, group_expr.len()); + let aggr_layout = RowLayout::new(&aggr_schema, RowType::WordAligned); + let aggr_buffer_width = aggr_layout.fixed_part_width(); + timer.done(); + + Ok(Self { + schema, + mode, + input, + aggr_expr, + group_expr, + group_schema, + aggr_schema, + aggr_layout, + aggr_buffer_width, + baseline_metrics, + aggregate_expressions, + accumulators: Default::default(), + random_state: Default::default(), + finished: false, + }) + } +} + +impl Stream for GroupedRowHashAggregateStream { + type Item = ArrowResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let this = &mut *self; + if this.finished { + return Poll::Ready(None); + } + + let elapsed_compute = this.baseline_metrics.elapsed_compute(); + + loop { + let result = match ready!(this.input.poll_next_unpin(cx)) { + Some(Ok(batch)) => { + let timer = elapsed_compute.timer(); + let result = group_aggregate_batch( + &this.mode, + &this.random_state, + &this.group_expr, + &this.aggr_expr, + &this.group_schema, + &this.aggr_schema, + &this.aggr_layout, + this.aggr_buffer_width, + batch, + &mut this.accumulators, + &this.aggregate_expressions, + ); + + timer.done(); + + match result { + Ok(_) => continue, + Err(e) => Err(ArrowError::ExternalError(Box::new(e))), + } + } + Some(Err(e)) => Err(e), + None => { + this.finished = true; + let timer = this.baseline_metrics.elapsed_compute().timer(); + let result = create_batch_from_map( + &this.mode, + &this.accumulators, + this.group_expr.len(), + &this.schema, + ) + .record_output(&this.baseline_metrics); + + timer.done(); + 
result + } + }; + + this.finished = true; + return Poll::Ready(Some(result)); + } + } +} + +impl RecordBatchStream for GroupedRowHashAggregateStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +/// TODO: Make this a member function of [`GroupedRowHashAggregateStream`] +fn group_aggregate_batch( + mode: &AggregateMode, + random_state: &RandomState, + group_expr: &[Arc], + aggr_expr: &[Arc], + group_schema: &Schema, + aggr_schema: &Schema, + aggr_row_layout: &RowLayout, + aggr_buffer_width: usize, + batch: RecordBatch, + accumulators: &mut Accumulators, + aggregate_expressions: &[Vec>], +) -> Result<()> { + // evaluate the grouping expressions + let group_values = evaluate(group_expr, &batch)?; + let group_rows: Vec> = create_group_rows(group_values, group_schema); + + // evaluate the aggregation expressions. + // We could evaluate them after the `take`, but since we need to evaluate all + // of them anyways, it is more performant to do it while they are together. + let aggr_input_values = evaluate_many(aggregate_expressions, &batch)?; + + // 1.1 construct the key from the group values + // 1.2 construct the mapping key if it does not exist + // 1.3 add the row' index to `indices` + + // track which entries in `accumulators` have rows in this batch to aggregate + let mut groups_with_rows = vec![]; + + // 1.1 Calculate the group keys for the group values + let mut batch_hashes = vec![0; batch.num_rows()]; + create_row_hashes(&group_rows, random_state, &mut batch_hashes)?; + + for (row, hash) in batch_hashes.into_iter().enumerate() { + let Accumulators { map, group_states } = accumulators; + + let entry = map.get_mut(hash, |(_hash, group_idx)| { + // verify that a group that we are inserting with hash is + // actually the same key value as the group in + // existing_idx (aka group_values @ row) + let group_state = &group_states[*group_idx]; + group_rows[row] == group_state.group_by_values + }); + + match entry { + // Existing entry for this group 
value + Some((_hash, group_idx)) => { + let group_state = &mut group_states[*group_idx]; + // 1.3 + if group_state.indices.is_empty() { + groups_with_rows.push(*group_idx); + }; + group_state.indices.push(row as u32); // remember this row + } + // 1.2 Need to create new entry + None => { + // Add new entry to group_states and save newly created index + let group_state = RowGroupState { + group_by_values: group_rows[row].clone(), + aggregation_buffer: Vec::with_capacity(aggr_buffer_width), + indices: vec![row as u32], // 1.3 + }; + let group_idx = group_states.len(); + group_states.push(group_state); + groups_with_rows.push(group_idx); + + // for hasher function, use precomputed hash value + map.insert(hash, (hash, group_idx), |(hash, _group_idx)| *hash); + } + }; + } + + // Collect all indices + offsets based on keys in this vec + let mut batch_indices: UInt32Builder = UInt32Builder::new(0); + let mut offsets = vec![0]; + let mut offset_so_far = 0; + for group_idx in groups_with_rows.iter() { + let indices = &accumulators.group_states[*group_idx].indices; + batch_indices.append_slice(indices)?; + offset_so_far += indices.len(); + offsets.push(offset_so_far); + } + let batch_indices = batch_indices.finish(); + + // `Take` all values based on indices into Arrays + let values: Vec>> = aggr_input_values + .iter() + .map(|array| { + array + .iter() + .map(|array| { + compute::take( + array.as_ref(), + &batch_indices, + None, // None: no index check + ) + .unwrap() + }) + .collect() + // 2.3 + }) + .collect(); + + // 2.1 for each key in this batch + // 2.2 for each aggregation + // 2.3 `slice` from each of its arrays the keys' values + // 2.4 update / merge the accumulator with the values + // 2.5 clear indices + groups_with_rows + .iter() + .zip(offsets.windows(2)) + .try_for_each(|(group_idx, offsets)| { + let group_state = &mut accumulators.group_states[*group_idx]; + // 2.2 + group_state + .accumulator_set + .iter_mut() + .zip(values.iter()) + .map(|(accumulator, 
aggr_array)| { + ( + accumulator, + aggr_array + .iter() + .map(|array| { + // 2.3 + array.slice(offsets[0], offsets[1] - offsets[0]) + }) + .collect::>(), + ) + }) + .try_for_each(|(accumulator, values)| match mode { + AggregateMode::Partial => accumulator.update_batch(&values), + AggregateMode::FinalPartitioned | AggregateMode::Final => { + // note: the aggregation here is over states, not values, thus the merge + accumulator.merge_batch(&values) + } + }) + // 2.5 + .and({ + group_state.indices.clear(); + Ok(()) + }) + })?; + + Ok(()) +} + +/// The state that is built for each output group. +#[derive(Debug)] +struct RowGroupState { + /// The actual group by values, stored sequentially + group_by_values: Vec, + + // Accumulator state, stored sequentially + aggregation_buffer: Vec, + + /// scratch space used to collect indices for input rows in a + /// bach that have values to aggregate. Reset on each batch + indices: Vec, +} + +/// The state of all the groups +#[derive(Default)] +struct Accumulators { + /// Logically maps group values to an index in `group_states` + /// + /// Uses the raw API of hashbrown to avoid actually storing the + /// keys in the table + /// + /// keys: u64 hashes of the GroupValue + /// values: (hash, index into `group_states`) + map: RawTable<(u64, usize)>, + + /// State for each group + group_states: Vec, +} + +impl std::fmt::Debug for Accumulators { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + // hashes are not store inline, so could only get values + let map_string = "RawTable"; + f.debug_struct("RowAccumulators") + .field("map", &map_string) + .field("row_group_states", &self.group_states) + .finish() + } +} + +/// Create grouping rows +fn create_group_rows(arrays: Vec, schema: &Schema) -> Vec> { + let mut writer = RowWriter::new(schema, RowType::Compact); + let mut results = vec![]; + for cur_row in 0..arrays[0].len() { + write_row(&mut writer, cur_row, schema, &arrays); + 
results.push(writer.get_row().to_vec()); + writer.reset() + } + results +} diff --git a/datafusion/core/src/physical_plan/hash_utils.rs b/datafusion/core/src/physical_plan/hash_utils.rs index 4e503b19e7bf3..3c0207a863cf7 100644 --- a/datafusion/core/src/physical_plan/hash_utils.rs +++ b/datafusion/core/src/physical_plan/hash_utils.rs @@ -268,6 +268,41 @@ pub fn create_hashes<'a>( return Ok(hashes_buffer); } +/// Test version of `create_row_hashes` that produces the same value for +/// all hashes (to test collisions) +/// +/// See comments on `hashes_buffer` for more details +#[cfg(feature = "force_hash_collisions")] +pub fn create_row_hashes<'a>( + _rows: &[Vec], + _random_state: &RandomState, + hashes_buffer: &'a mut Vec, +) -> Result<&'a mut Vec> { + for hash in hashes_buffer.iter_mut() { + *hash = 0 + } + return Ok(hashes_buffer); +} + +/// Test version of `create_row_hashes` that produces the same value for +/// all hashes (to test collisions) +/// +/// See comments on `hashes_buffer` for more details +#[cfg(not(feature = "force_hash_collisions"))] +pub fn create_row_hashes<'a>( + rows: &[Vec], + random_state: &RandomState, + hashes_buffer: &'a mut Vec, +) -> Result<&'a mut Vec> { + for hash in hashes_buffer.iter_mut() { + *hash = 0 + } + for (i, hash) in hashes_buffer.iter_mut().enumerate() { + *hash = >::get_hash(&rows[i], random_state); + } + return Ok(hashes_buffer); +} + /// Creates hash values for every row, based on the values in the /// columns. 
/// diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 2ae5fa3198408..3df9e984b3a40 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -34,8 +34,9 @@ path = "src/lib.rs" [features] crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] -default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] +default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "row"] regex_expressions = ["regex"] +row = ["datafusion-row"] unicode_expressions = ["unicode-segmentation"] [dependencies] @@ -46,6 +47,7 @@ blake3 = { version = "1.0", optional = true } chrono = { version = "0.4", default-features = false } datafusion-common = { path = "../common", version = "7.0.0" } datafusion-expr = { path = "../expr", version = "7.0.0" } +datafusion-row = { path = "../row", version = "7.0.0", optional = true } hashbrown = { version = "0.12", features = ["raw"] } lazy_static = { version = "^1.4.0" } md-5 = { version = "^0.10.0", optional = true } diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 637a7f99d35ca..a917af3993732 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -21,6 +21,7 @@ use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::aggregate::sum; use crate::expressions::format_state_name; use crate::{AggregateExpr, PhysicalExpr}; @@ -33,6 +34,7 @@ use arrow::{ use datafusion_common::ScalarValue; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::Accumulator; +use datafusion_row::accessor::RowAccessor; /// AVG aggregate expression #[derive(Debug)] @@ -101,6 +103,22 @@ impl AggregateExpr for Avg { fn name(&self) -> &str { &self.name } + + fn row_state_supported(&self) -> bool { + matches!( + self.data_type, + DataType::UInt8 + 
| DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + ) + } } /// An accumulator to compute the average @@ -167,6 +185,59 @@ impl Accumulator for AvgAccumulator { } } +#[derive(Debug)] +struct AvgRowAccumulator { + start_index: usize, + sum_datatype: DataType, +} + +impl AvgRowAccumulator { + pub fn new(start_index: usize, sum_datatype: DataType) -> Self { + Self { + start_index, + sum_datatype, + } + } +} + +impl RowAccumulator for AvgRowAccumulator { + fn update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let values = &values[0]; + + let delta = (values.len() - values.data().null_count()) as u64; + accessor.add_u64(self.start_index, delta); + sum::add_to_row( + &self.sum_datatype, + self.start_index + 1, + accessor, + &sum::sum_batch(values)?, + )?; + Ok(()) + } + + fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let counts = states[0].as_any().downcast_ref::().unwrap(); + let delta = compute::sum(counts).unwrap_or(0); + accessor.add_u64(self.start_index, delta); + + sum::add_to_row( + &self.sum_datatype, + self.start_index + 1, + accessor, + &sum::sum_batch(&states[1])?, + )?; + Ok(()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/aggregate/count.rs b/datafusion/physical-expr/src/aggregate/count.rs index 9e8485e928c25..b3210900fca6c 100644 --- a/datafusion/physical-expr/src/aggregate/count.rs +++ b/datafusion/physical-expr/src/aggregate/count.rs @@ -18,8 +18,10 @@ //! 
Defines physical expressions that can evaluated at runtime during query execution use std::any::Any; +use std::fmt::Debug; use std::sync::Arc; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::{AggregateExpr, PhysicalExpr}; use arrow::compute; use arrow::datatypes::DataType; @@ -30,6 +32,7 @@ use arrow::{ use datafusion_common::Result; use datafusion_common::ScalarValue; use datafusion_expr::Accumulator; +use datafusion_row::accessor::RowAccessor; use crate::expressions::format_state_name; @@ -92,6 +95,10 @@ impl AggregateExpr for Count { fn name(&self) -> &str { &self.name } + + fn row_state_supported(&self) -> bool { + true + } } #[derive(Debug)] @@ -131,6 +138,43 @@ impl Accumulator for CountAccumulator { } } +#[derive(Debug)] +struct CountRowAccumulator { + index: usize, +} + +impl CountRowAccumulator { + pub fn new(index: usize) -> Self { + Self { index } + } +} + +impl RowAccumulator for CountRowAccumulator { + fn update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let array = &values[0]; + let delta = (array.len() - array.data().null_count()) as u64; + accessor.add_u64(self.index, delta); + Ok(()) + } + + fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let counts = states[0].as_any().downcast_ref::().unwrap(); + let delta = &compute::sum(counts); + if let Some(d) = delta { + accessor.add_u64(self.index, *d); + } + Ok(()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 7de10e4b8a7e3..55ab77cbd647c 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -37,9 +37,11 @@ use datafusion_common::ScalarValue; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::Accumulator; +use crate::aggregate::row_accumulator::RowAccumulator; use 
crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; +use datafusion_row::accessor::RowAccessor; // Min/max aggregation can take Dictionary encode input but always produces unpacked // (aka non Dictionary) output. We need to adjust the output data type to reflect this. @@ -111,6 +113,22 @@ impl AggregateExpr for Max { fn name(&self) -> &str { &self.name } + + fn row_state_supported(&self) -> bool { + matches!( + self.data_type, + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + ) + } } // Statically-typed version of min/max(array) -> ScalarValue for string types. @@ -303,6 +321,18 @@ macro_rules! typed_min_max { }}; } +// min/max of two non-string scalar values. +macro_rules! typed_min_max_row { + ($INDEX:ident, $ACC:ident, $SCALAR:expr, $TYPE:ident, $OP:ident) => {{ + paste::item! { + match $SCALAR { + None => {} + Some(v) => $ACC.[<$OP _ $TYPE>]($INDEX, *v as $TYPE) + } + } + }}; +} + // min/max of two scalar string values. macro_rules! typed_min_max_string { ($VALUE:expr, $DELTA:expr, $SCALAR:ident, $OP:ident) => {{ @@ -408,16 +438,68 @@ macro_rules! min_max { }}; } +// min/max of two scalar values of the same type +macro_rules! 
min_max_row { + ($INDEX:ident, $ACC:ident, $SCALAR:expr, $OP:ident) => {{ + Ok(match $SCALAR { + ScalarValue::Float64(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, f64, $OP) + } + ScalarValue::Float32(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, f32, $OP) + } + ScalarValue::UInt64(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, u64, $OP) + } + ScalarValue::UInt32(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, u32, $OP) + } + ScalarValue::UInt16(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, u16, $OP) + } + ScalarValue::UInt8(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, u8, $OP) + } + ScalarValue::Int64(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, i64, $OP) + } + ScalarValue::Int32(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, i32, $OP) + } + ScalarValue::Int16(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, i16, $OP) + } + ScalarValue::Int8(rhs) => { + typed_min_max_row!($INDEX, $ACC, rhs, i8, $OP) + } + e => { + return Err(DataFusionError::Internal(format!( + "MIN/MAX is not expected to receive scalars of incompatible types {:?}", + e + ))) + } + }) + }}; +} + /// the minimum of two scalar values pub fn min(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { min_max!(lhs, rhs, min) } +pub fn min_row(index: usize, accessor: &mut RowAccessor, s: &ScalarValue) -> Result<()> { + min_max_row!(index, accessor, s, min) +} + /// the maximum of two scalar values pub fn max(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { min_max!(lhs, rhs, max) } +pub fn max_row(index: usize, accessor: &mut RowAccessor, s: &ScalarValue) -> Result<()> { + min_max_row!(index, accessor, s, max) +} + /// An accumulator to compute the maximum value #[derive(Debug)] pub struct MaxAccumulator { @@ -454,6 +536,38 @@ impl Accumulator for MaxAccumulator { } } +#[derive(Debug)] +struct MaxRowAccumulator { + index: usize, +} + +impl MaxRowAccumulator { + pub fn new(index: usize) -> Self { + Self { index } + } +} + +impl RowAccumulator for MaxRowAccumulator { + fn 
update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let values = &values[0]; + let delta = &max_batch(values)?; + max_row(self.index, accessor, delta)?; + Ok(()) + } + + fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + self.update_batch(states, accessor) + } +} + /// MIN aggregate expression #[derive(Debug)] pub struct Min { @@ -512,6 +626,22 @@ impl AggregateExpr for Min { fn name(&self) -> &str { &self.name } + + fn row_state_supported(&self) -> bool { + matches!( + self.data_type, + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + ) + } } /// An accumulator to compute the minimum value @@ -550,6 +680,38 @@ impl Accumulator for MinAccumulator { } } +#[derive(Debug)] +struct MinRowAccumulator { + index: usize, +} + +impl MinRowAccumulator { + pub fn new(index: usize) -> Self { + Self { index } + } +} + +impl RowAccumulator for MinRowAccumulator { + fn update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let values = &values[0]; + let delta = &min_batch(values)?; + min_row(self.index, accessor, delta)?; + Ok(()) + } + + fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + self.update_batch(states, accessor) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index 019a60cd57607..13bef9018e5a9 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -38,6 +38,7 @@ pub(crate) mod distinct_expressions; pub(crate) mod min_max; pub mod build_in; mod hyperloglog; +pub mod row_accumulator; pub(crate) mod stats; pub(crate) mod stddev; pub(crate) mod sum; @@ -75,4 
+76,9 @@ pub trait AggregateExpr: Send + Sync + Debug { fn name(&self) -> &str { "AggregateExpr: default name" } + + /// If the aggregate expression is supported by row format + fn row_state_supported(&self) -> bool { + false + } } diff --git a/datafusion/physical-expr/src/aggregate/row_accumulator.rs b/datafusion/physical-expr/src/aggregate/row_accumulator.rs new file mode 100644 index 0000000000000..a0024f2eb3f22 --- /dev/null +++ b/datafusion/physical-expr/src/aggregate/row_accumulator.rs @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Accumulator in raw format + +use arrow::array::ArrayRef; +use datafusion_common::Result; +use datafusion_row::accessor::RowAccessor; +use std::fmt::Debug; + +pub trait RowAccumulator: Send + Sync + Debug { + /// updates the accumulator's state from a vector of arrays. + fn update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()>; + + /// updates the accumulator's state from a vector of states. 
+ fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()>; +} diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index 12572f9a9324a..82c34df3786f5 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -34,9 +34,11 @@ use arrow::{ use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::Accumulator; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; +use datafusion_row::accessor::RowAccessor; /// SUM aggregate expression #[derive(Debug)] @@ -96,6 +98,22 @@ impl AggregateExpr for Sum { fn name(&self) -> &str { &self.name } + + fn row_state_supported(&self) -> bool { + matches!( + self.data_type, + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + ) + } } #[derive(Debug)] @@ -180,6 +198,17 @@ macro_rules! typed_sum { }}; } +macro_rules! sum_row { + ($INDEX:ident, $ACC:ident, $DELTA:expr, $TYPE:ident) => {{ + paste::item! 
{ + match $DELTA { + None => {} + Some(v) => $ACC.[]($INDEX, *v as $TYPE) + } + } + }}; +} + // TODO implement this in arrow-rs with simd // https://github.com/apache/arrow-rs/issues/1010 fn sum_decimal( @@ -284,7 +313,7 @@ pub(crate) fn sum(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { (ScalarValue::UInt64(lhs), ScalarValue::UInt8(rhs)) => { typed_sum!(lhs, rhs, UInt64, u64) } - // i64 coerces i* to u64 + // i64 coerces i* to i64 (ScalarValue::Int64(lhs), ScalarValue::Int64(rhs)) => { typed_sum!(lhs, rhs, Int64, i64) } @@ -306,6 +335,84 @@ pub(crate) fn sum(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { }) } +pub(crate) fn add_to_row( + dt: &DataType, + index: usize, + accessor: &mut RowAccessor, + s: &ScalarValue, +) -> Result<()> { + match (dt, s) { + // float64 coerces everything to f64 + (DataType::Float64, ScalarValue::Float64(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::Float32(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::Int64(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::Int32(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::Int16(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::Int8(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::UInt64(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::UInt32(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::UInt16(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + (DataType::Float64, ScalarValue::UInt8(rhs)) => { + sum_row!(index, accessor, rhs, f64) + } + // float32 has no cast + (DataType::Float32, ScalarValue::Float32(rhs)) => { + sum_row!(index, accessor, rhs, f32) + } + // u64 coerces u* to u64 + (DataType::UInt64, ScalarValue::UInt64(rhs)) => { + sum_row!(index, accessor, rhs, u64) + } + 
(DataType::UInt64, ScalarValue::UInt32(rhs)) => { + sum_row!(index, accessor, rhs, u64) + } + (DataType::UInt64, ScalarValue::UInt16(rhs)) => { + sum_row!(index, accessor, rhs, u64) + } + (DataType::UInt64, ScalarValue::UInt8(rhs)) => { + sum_row!(index, accessor, rhs, u64) + } + // i64 coerces i* to i64 + (DataType::Int64, ScalarValue::Int64(rhs)) => { + sum_row!(index, accessor, rhs, i64) + } + (DataType::Int64, ScalarValue::Int32(rhs)) => { + sum_row!(index, accessor, rhs, i64) + } + (DataType::Int64, ScalarValue::Int16(rhs)) => { + sum_row!(index, accessor, rhs, i64) + } + (DataType::Int64, ScalarValue::Int8(rhs)) => { + sum_row!(index, accessor, rhs, i64) + } + e => { + return Err(DataFusionError::Internal(format!( + "Row sum updater is not expected to receive a scalar {:?}", + e + ))); + } + } + Ok(()) +} + impl Accumulator for SumAccumulator { fn state(&self) -> Result> { Ok(vec![self.sum.clone()]) @@ -329,6 +436,38 @@ impl Accumulator for SumAccumulator { } } +#[derive(Debug)] +struct SumRowAccumulator { + index: usize, + datatype: DataType, +} + +impl SumRowAccumulator { + pub fn new(index: usize, datatype: DataType) -> Self { + Self { index, datatype } + } +} + +impl RowAccumulator for SumRowAccumulator { + fn update_batch( + &mut self, + values: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + let values = &values[0]; + add_to_row(&self.datatype, self.index, accessor, &sum_batch(values)?)?; + Ok(()) + } + + fn merge_batch( + &mut self, + states: &[ArrayRef], + accessor: &mut RowAccessor, + ) -> Result<()> { + self.update_batch(states, accessor) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/row/src/accessor.rs b/datafusion/row/src/accessor.rs new file mode 100644 index 0000000000000..7b204fe85b7c2 --- /dev/null +++ b/datafusion/row/src/accessor.rs @@ -0,0 +1,302 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Setter/Getter for row with all fixed-sized fields. + +use crate::layout::{RowLayout, RowType}; +use crate::validity::{all_valid, NullBitsFormatter}; +use crate::{fn_get_idx, fn_get_idx_opt, fn_set_idx, get_idx, set_idx}; +use arrow::datatypes::Schema; +use arrow::util::bit_util::{get_bit_raw, set_bit_raw, unset_bit_raw}; + +//TODO: DRY with reader and writer + +/// Read the tuple `data[base_offset..]` we are currently pointing to +pub struct RowAccessor<'a> { + /// Layout on how to read each field + layout: RowLayout, + /// Raw bytes slice where the tuple stores + data: &'a mut [u8], + /// Start position for the current tuple in the raw bytes slice. + base_offset: usize, +} + +impl<'a> std::fmt::Debug for RowAccessor<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.null_free() { + write!(f, "null_free") + } else { + let null_bits = self.null_bits(); + write!( + f, + "{:?}", + NullBitsFormatter::new(null_bits, self.layout.field_count) + ) + } + } +} + +#[macro_export] +macro_rules! fn_add_idx { + ($NATIVE: ident) => { + paste::item! 
{ + /// add field at `idx` with `value` + pub fn [](&mut self, idx: usize, value: $NATIVE) { + if self.is_valid_at(idx) { + self.[](idx, value + self.[](idx)); + } else { + self.set_non_null_at(idx); + self.[](idx, value); + } + } + } + }; +} + +macro_rules! fn_max_min_idx { + ($NATIVE: ident, $OP: ident) => { + paste::item! { + /// check max then update + pub fn [<$OP _ $NATIVE>](&mut self, idx: usize, value: $NATIVE) { + if self.is_valid_at(idx) { + let v = value.$OP(self.[](idx)); + self.[](idx, v); + } else { + self.set_non_null_at(idx); + self.[](idx, value); + } + } + } + }; +} + +impl<'a> RowAccessor<'a> { + /// new + pub fn new(schema: &Schema, row_type: RowType) -> Self { + Self { + layout: RowLayout::new(schema, row_type), + data: &mut [], + base_offset: 0, + } + } + + /// Update this row to point to position `offset` in `base` + pub fn point_to(&mut self, offset: usize, data: &'a mut [u8]) { + self.base_offset = offset; + self.data = data; + } + + #[inline] + fn assert_index_valid(&self, idx: usize) { + assert!(idx < self.layout.field_count); + } + + #[inline(always)] + fn field_offsets(&self) -> &[usize] { + &self.layout.field_offsets + } + + #[inline(always)] + fn null_free(&self) -> bool { + self.layout.null_free + } + + #[inline(always)] + fn null_bits(&self) -> &[u8] { + if self.null_free() { + &[] + } else { + let start = self.base_offset; + &self.data[start..start + self.layout.null_width] + } + } + + #[inline(always)] + fn all_valid(&self) -> bool { + if self.null_free() { + true + } else { + let null_bits = self.null_bits(); + all_valid(null_bits, self.layout.field_count) + } + } + + fn is_valid_at(&self, idx: usize) -> bool { + unsafe { get_bit_raw(self.null_bits().as_ptr(), idx) } + } + + // ------------------------------ + // ----- Fixed Sized getters ---- + // ------------------------------ + + fn get_bool(&self, idx: usize) -> bool { + self.assert_index_valid(idx); + let offset = self.field_offsets()[idx]; + let value = 
&self.data[self.base_offset + offset..]; + value[0] != 0 + } + + fn get_u8(&self, idx: usize) -> u8 { + self.assert_index_valid(idx); + let offset = self.field_offsets()[idx]; + self.data[self.base_offset + offset] + } + + fn_get_idx!(u16, 2); + fn_get_idx!(u32, 4); + fn_get_idx!(u64, 8); + fn_get_idx!(i8, 1); + fn_get_idx!(i16, 2); + fn_get_idx!(i32, 4); + fn_get_idx!(i64, 8); + fn_get_idx!(f32, 4); + fn_get_idx!(f64, 8); + + fn get_date32(&self, idx: usize) -> i32 { + get_idx!(i32, self, idx, 4) + } + + fn get_date64(&self, idx: usize) -> i64 { + get_idx!(i64, self, idx, 8) + } + + fn_get_idx_opt!(bool); + fn_get_idx_opt!(u8); + fn_get_idx_opt!(u16); + fn_get_idx_opt!(u32); + fn_get_idx_opt!(u64); + fn_get_idx_opt!(i8); + fn_get_idx_opt!(i16); + fn_get_idx_opt!(i32); + fn_get_idx_opt!(i64); + fn_get_idx_opt!(f32); + fn_get_idx_opt!(f64); + + fn get_date32_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_date32(idx)) + } else { + None + } + } + + fn get_date64_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_date64(idx)) + } else { + None + } + } + + // ------------------------------ + // ----- Fixed Sized setters ---- + // ------------------------------ + + pub(crate) fn set_null_at(&mut self, idx: usize) { + assert!( + !self.null_free(), + "Unexpected call to set_null_at on null-free row writer" + ); + let null_bits = &mut self.data[0..self.layout.null_width]; + unsafe { + unset_bit_raw(null_bits.as_mut_ptr(), idx); + } + } + + pub(crate) fn set_non_null_at(&mut self, idx: usize) { + assert!( + !self.null_free(), + "Unexpected call to set_non_null_at on null-free row writer" + ); + let null_bits = &mut self.data[0..self.layout.null_width]; + unsafe { + set_bit_raw(null_bits.as_mut_ptr(), idx); + } + } + + fn set_bool(&mut self, idx: usize, value: bool) { + self.assert_index_valid(idx); + let offset = self.field_offsets()[idx]; + self.data[offset] = if value { 1 } else { 0 }; + } + + fn set_u8(&mut 
self, idx: usize, value: u8) { + self.assert_index_valid(idx); + let offset = self.field_offsets()[idx]; + self.data[offset] = value; + } + + fn_set_idx!(u16, 2); + fn_set_idx!(u32, 4); + fn_set_idx!(u64, 8); + fn_set_idx!(i16, 2); + fn_set_idx!(i32, 4); + fn_set_idx!(i64, 8); + fn_set_idx!(f32, 4); + fn_set_idx!(f64, 8); + + fn set_i8(&mut self, idx: usize, value: i8) { + self.assert_index_valid(idx); + let offset = self.field_offsets()[idx]; + self.data[offset] = value.to_le_bytes()[0]; + } + + fn set_date32(&mut self, idx: usize, value: i32) { + set_idx!(4, self, idx, value) + } + + fn set_date64(&mut self, idx: usize, value: i64) { + set_idx!(8, self, idx, value) + } + + // ------------------------------ + // ---- Fixed sized updaters ---- + // ------------------------------ + + fn_add_idx!(u8); + fn_add_idx!(u16); + fn_add_idx!(u32); + fn_add_idx!(u64); + fn_add_idx!(i8); + fn_add_idx!(i16); + fn_add_idx!(i32); + fn_add_idx!(i64); + fn_add_idx!(f32); + fn_add_idx!(f64); + + fn_max_min_idx!(u8, max); + fn_max_min_idx!(u16, max); + fn_max_min_idx!(u32, max); + fn_max_min_idx!(u64, max); + fn_max_min_idx!(i8, max); + fn_max_min_idx!(i16, max); + fn_max_min_idx!(i32, max); + fn_max_min_idx!(i64, max); + fn_max_min_idx!(f32, max); + fn_max_min_idx!(f64, max); + + fn_max_min_idx!(u8, min); + fn_max_min_idx!(u16, min); + fn_max_min_idx!(u32, min); + fn_max_min_idx!(u64, min); + fn_max_min_idx!(i8, min); + fn_max_min_idx!(i16, min); + fn_max_min_idx!(i32, min); + fn_max_min_idx!(i64, min); + fn_max_min_idx!(f32, min); + fn_max_min_idx!(f64, min); +} diff --git a/datafusion/row/src/layout.rs b/datafusion/row/src/layout.rs index b017d195836d4..2c4c15da5a09e 100644 --- a/datafusion/row/src/layout.rs +++ b/datafusion/row/src/layout.rs @@ -39,7 +39,7 @@ pub enum RowType { /// Reveals how the fields of a record are stored in the raw-bytes format #[derive(Debug)] -pub(crate) struct RowLayout { +pub struct RowLayout { /// Type of the layout row_type: RowType, /// If a row is 
null free according to its schema @@ -55,7 +55,8 @@ pub(crate) struct RowLayout { } impl RowLayout { - pub(crate) fn new(schema: &Schema, row_type: RowType) -> Self { + /// new + pub fn new(schema: &Schema, row_type: RowType) -> Self { assert!(row_supported(schema, row_type)); let null_free = schema_null_free(schema); let field_count = schema.fields().len(); @@ -81,8 +82,9 @@ impl RowLayout { } } + /// Get fixed part width for this layout #[inline(always)] - pub(crate) fn fixed_part_width(&self) -> usize { + pub fn fixed_part_width(&self) -> usize { self.null_width + self.values_width } } diff --git a/datafusion/row/src/lib.rs b/datafusion/row/src/lib.rs index 54c112dd5e063..f954b16bc36cd 100644 --- a/datafusion/row/src/lib.rs +++ b/datafusion/row/src/lib.rs @@ -54,6 +54,7 @@ use arrow::record_batch::RecordBatch; pub use layout::RowType; use std::sync::Arc; +pub mod accessor; #[cfg(feature = "jit")] pub mod jit; pub mod layout; diff --git a/datafusion/row/src/reader.rs b/datafusion/row/src/reader.rs index e7ee004b0076d..77e9a552cbf84 100644 --- a/datafusion/row/src/reader.rs +++ b/datafusion/row/src/reader.rs @@ -46,6 +46,7 @@ pub fn read_as_batch( output.output().map_err(DataFusionError::ArrowError) } +#[macro_export] macro_rules! get_idx { ($NATIVE: ident, $SELF: ident, $IDX: ident, $WIDTH: literal) => {{ $SELF.assert_index_valid($IDX); @@ -56,6 +57,7 @@ macro_rules! get_idx { }}; } +#[macro_export] macro_rules! fn_get_idx { ($NATIVE: ident, $WIDTH: literal) => { paste::item! { @@ -70,6 +72,7 @@ macro_rules! fn_get_idx { }; } +#[macro_export] macro_rules! fn_get_idx_opt { ($NATIVE: ident) => { paste::item! { diff --git a/datafusion/row/src/writer.rs b/datafusion/row/src/writer.rs index 6b9ffdc0e31d5..d71e1dbc073c1 100644 --- a/datafusion/row/src/writer.rs +++ b/datafusion/row/src/writer.rs @@ -75,6 +75,7 @@ pub fn bench_write_batch( Ok(lengths) } +#[macro_export] macro_rules! 
set_idx { ($WIDTH: literal, $SELF: ident, $IDX: ident, $VALUE: ident) => {{ $SELF.assert_index_valid($IDX); @@ -83,6 +84,7 @@ macro_rules! set_idx { }}; } +#[macro_export] macro_rules! fn_set_idx { ($NATIVE: ident, $WIDTH: literal) => { paste::item! { From ddfd601d20f471112dc3be61c90411e71aa7375e Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 29 Apr 2022 13:25:38 +0800 Subject: [PATCH 3/8] main updating procedure --- .../core/src/physical_plan/aggregates/mod.rs | 18 +++++++++++ .../src/physical_plan/aggregates/row_hash.rs | 30 ++++++++++++++----- .../physical-expr/src/aggregate/average.rs | 7 +++++ .../physical-expr/src/aggregate/count.rs | 4 +++ .../physical-expr/src/aggregate/min_max.rs | 8 +++++ datafusion/physical-expr/src/aggregate/mod.rs | 5 ++++ datafusion/physical-expr/src/aggregate/sum.rs | 4 +++ 7 files changed, 68 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index d2cb486dba3ca..a91fb2da5102a 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -45,6 +45,7 @@ mod no_grouping; mod row_hash; pub use datafusion_expr::AggregateFunction; +use datafusion_physical_expr::aggregate::row_accumulator::RowAccumulator; pub use datafusion_physical_expr::expressions::create_aggregate_expr; /// Hash aggregate modes @@ -366,6 +367,7 @@ fn merge_expressions( } pub(crate) type AccumulatorItem = Box; +pub(crate) type AccumulatorItemV2 = Box; fn create_accumulators( aggr_expr: &[Arc], @@ -376,6 +378,22 @@ fn create_accumulators( .collect::>>() } +fn check_accumulator_v2_supported( + aggr_expr: &[Arc] +) -> bool { + aggr_expr.iter().all(|expr| expr.row_state_supported()) +} + +fn create_accumulators_v2( + aggr_expr: &[Arc], +) -> datafusion_common::Result> { + aggr_expr + .iter() + .enumerate() + .map(|(idx, expr)| expr.create_accumulator_v2(idx)) + .collect::>>() +} + /// returns a vector of 
ArrayRefs, where each entry corresponds to either the /// final value (mode = Final) or states (mode = Partial) fn finalize_aggregation( diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index 251adb23790b4..943f38f9c02d0 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -28,7 +28,7 @@ use futures::{ }; use crate::error::Result; -use crate::physical_plan::aggregates::{evaluate, evaluate_many, AggregateMode}; +use crate::physical_plan::aggregates::{evaluate, evaluate_many, AggregateMode, AccumulatorItemV2}; use crate::physical_plan::hash_utils::create_row_hashes; use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; @@ -45,6 +45,7 @@ use datafusion_row::layout::RowLayout; use datafusion_row::writer::{write_row, RowWriter}; use datafusion_row::RowType; use hashbrown::raw::RawTable; +use datafusion_row::accessor::RowAccessor; /* The architecture is the following: @@ -80,6 +81,7 @@ pub(crate) struct GroupedRowHashAggregateStream { aggr_expr: Vec>, group_expr: Vec>, + accs_v2: Vec, group_schema: SchemaRef, aggr_schema: SchemaRef, @@ -117,6 +119,8 @@ impl GroupedRowHashAggregateStream { let aggregate_expressions = aggregates::aggregate_expressions(&aggr_expr, &mode, group_expr.len())?; + let accs_v2 = aggregates::create_accumulators_v2(&aggr_expr)?; + let (group_schema, aggr_schema) = create_separate_schema(&schema, group_expr.len()); let aggr_layout = RowLayout::new(&aggr_schema, RowType::WordAligned); @@ -129,6 +133,7 @@ impl GroupedRowHashAggregateStream { input, aggr_expr, group_expr, + accs_v2, group_schema, aggr_schema, aggr_layout, @@ -165,6 +170,7 @@ impl Stream for GroupedRowHashAggregateStream { &this.random_state, &this.group_expr, &this.aggr_expr, + &mut this.accs_v2, &this.group_schema, &this.aggr_schema, 
&this.aggr_layout, @@ -216,6 +222,7 @@ fn group_aggregate_batch( random_state: &RandomState, group_expr: &[Arc], aggr_expr: &[Arc], + accs_v2: &mut [AccumulatorItemV2], group_schema: &Schema, aggr_schema: &Schema, aggr_row_layout: &RowLayout, @@ -232,6 +239,7 @@ fn group_aggregate_batch( // We could evaluate them after the `take`, but since we need to evaluate all // of them anyways, it is more performant to do it while they are together. let aggr_input_values = evaluate_many(aggregate_expressions, &batch)?; + let mut state_accessor = RowAccessor::new(aggr_schema, RowType::WordAligned); // 1.1 construct the key from the group values // 1.2 construct the mapping key if it does not exist @@ -325,8 +333,7 @@ fn group_aggregate_batch( .try_for_each(|(group_idx, offsets)| { let group_state = &mut accumulators.group_states[*group_idx]; // 2.2 - group_state - .accumulator_set + accs_v2 .iter_mut() .zip(values.iter()) .map(|(accumulator, aggr_array)| { @@ -341,11 +348,14 @@ fn group_aggregate_batch( .collect::>(), ) }) - .try_for_each(|(accumulator, values)| match mode { - AggregateMode::Partial => accumulator.update_batch(&values), - AggregateMode::FinalPartitioned | AggregateMode::Final => { - // note: the aggregation here is over states, not values, thus the merge - accumulator.merge_batch(&values) + .try_for_each(|(accumulator, values)| { + state_accessor.point_to(0, group_state.aggregation_buffer.as_mut_slice()); + match mode { + AggregateMode::Partial => accumulator.update_batch(&values, &mut state_accessor), + AggregateMode::FinalPartitioned | AggregateMode::Final => { + // note: the aggregation here is over states, not values, thus the merge + accumulator.merge_batch(&values, &mut state_accessor) + } } }) // 2.5 @@ -410,3 +420,7 @@ fn create_group_rows(arrays: Vec, schema: &Schema) -> Vec> { } results } + +fn create_state_accessor(schema: &Schema) -> RowAccessor { + RowAccessor:: +} diff --git a/datafusion/physical-expr/src/aggregate/average.rs 
b/datafusion/physical-expr/src/aggregate/average.rs index a917af3993732..10acde5170381 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -119,6 +119,13 @@ impl AggregateExpr for Avg { | DataType::Float64 ) } + + fn create_accumulator_v2(&self, start_index: usize) -> Result> { + Ok(Box::new(AvgRowAccumulator::new( + start_index, + self.data_type.clone(), + ))) + } } /// An accumulator to compute the average diff --git a/datafusion/physical-expr/src/aggregate/count.rs b/datafusion/physical-expr/src/aggregate/count.rs index b3210900fca6c..b66b98470760e 100644 --- a/datafusion/physical-expr/src/aggregate/count.rs +++ b/datafusion/physical-expr/src/aggregate/count.rs @@ -99,6 +99,10 @@ impl AggregateExpr for Count { fn row_state_supported(&self) -> bool { true } + + fn create_accumulator_v2(&self, start_index: usize) -> Result> { + Ok(Box::new(CountRowAccumulator::new(start_index))) + } } #[derive(Debug)] diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 55ab77cbd647c..9abbea062efda 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -129,6 +129,10 @@ impl AggregateExpr for Max { | DataType::Float64 ) } + + fn create_accumulator_v2(&self, start_index: usize) -> Result> { + Ok(Box::new(MaxRowAccumulator::new(start_index))) + } } // Statically-typed version of min/max(array) -> ScalarValue for string types. 
@@ -642,6 +646,10 @@ impl AggregateExpr for Min { | DataType::Float64 ) } + + fn create_accumulator_v2(&self, start_index: usize) -> Result> { + Ok(Box::new(MinRowAccumulator::new(start_index))) + } } /// An accumulator to compute the minimum value diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index 13bef9018e5a9..411638a43ced2 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -22,6 +22,7 @@ use datafusion_expr::Accumulator; use std::any::Any; use std::fmt::Debug; use std::sync::Arc; +use crate::aggregate::row_accumulator::RowAccumulator; pub(crate) mod approx_distinct; pub(crate) mod approx_median; @@ -81,4 +82,8 @@ pub trait AggregateExpr: Send + Sync + Debug { fn row_state_supported(&self) -> bool { false } + + fn create_accumulator_v2(&self, _start_index: usize) -> Result> { + unreachable!() + } } diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index 82c34df3786f5..431a377871d81 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -114,6 +114,10 @@ impl AggregateExpr for Sum { | DataType::Float64 ) } + + fn create_accumulator_v2(&self, start_index: usize) -> Result> { + Ok(Box::new(SumRowAccumulator::new(start_index, self.data_type.clone()))) + } } #[derive(Debug)] From 2dd2d16a692b5068007d5c93d86821cfe6ef0ecb Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 29 Apr 2022 19:04:42 +0800 Subject: [PATCH 4/8] output as record batch --- .../core/src/physical_plan/aggregates/mod.rs | 10 +- .../src/physical_plan/aggregates/row_hash.rs | 104 +++++++++++++--- .../{row_accumulator.rs => accumulator_v2.rs} | 7 +- .../physical-expr/src/aggregate/average.rs | 30 +++-- .../physical-expr/src/aggregate/count.rs | 21 ++-- .../physical-expr/src/aggregate/min_max.rs | 84 ++++++++----- datafusion/physical-expr/src/aggregate/mod.rs | 11 
+- datafusion/physical-expr/src/aggregate/sum.rs | 24 ++-- datafusion/row/src/accessor.rs | 113 ++++++++---------- datafusion/row/src/layout.rs | 2 +- datafusion/row/src/lib.rs | 10 +- datafusion/row/src/reader.rs | 2 +- 12 files changed, 274 insertions(+), 144 deletions(-) rename datafusion/physical-expr/src/aggregate/{row_accumulator.rs => accumulator_v2.rs} (85%) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index a91fb2da5102a..45f40629b4fc0 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -45,7 +45,7 @@ mod no_grouping; mod row_hash; pub use datafusion_expr::AggregateFunction; -use datafusion_physical_expr::aggregate::row_accumulator::RowAccumulator; +use datafusion_physical_expr::aggregate::accumulator_v2::AccumulatorV2; pub use datafusion_physical_expr::expressions::create_aggregate_expr; /// Hash aggregate modes @@ -367,7 +367,7 @@ fn merge_expressions( } pub(crate) type AccumulatorItem = Box; -pub(crate) type AccumulatorItemV2 = Box; +pub(crate) type AccumulatorItemV2 = Box; fn create_accumulators( aggr_expr: &[Arc], @@ -378,10 +378,8 @@ fn create_accumulators( .collect::>>() } -fn check_accumulator_v2_supported( - aggr_expr: &[Arc] -) -> bool { - aggr_expr.iter().all(|expr| expr.row_state_supported()) +fn check_accumulator_v2_supported(aggr_expr: &[Arc]) -> bool { + aggr_expr.iter().all(|expr| expr.accumulator_v2_supported()) } fn create_accumulators_v2( diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index 943f38f9c02d0..2da8e39d42d8a 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -28,12 +28,15 @@ use futures::{ }; use crate::error::Result; -use crate::physical_plan::aggregates::{evaluate, evaluate_many, AggregateMode, AccumulatorItemV2}; 
+use crate::physical_plan::aggregates::{ + evaluate, evaluate_many, AccumulatorItemV2, AggregateMode, +}; use crate::physical_plan::hash_utils::create_row_hashes; use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; +use arrow::compute::cast; use arrow::datatypes::Schema; use arrow::{array::ArrayRef, compute}; use arrow::{ @@ -41,11 +44,13 @@ use arrow::{ error::{ArrowError, Result as ArrowResult}, }; use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; +use datafusion_common::ScalarValue; +use datafusion_row::accessor::RowAccessor; use datafusion_row::layout::RowLayout; +use datafusion_row::reader::{read_row, RowReader}; use datafusion_row::writer::{write_row, RowWriter}; -use datafusion_row::RowType; +use datafusion_row::{MutableRecordBatch, RowType}; use hashbrown::raw::RawTable; -use datafusion_row::accessor::RowAccessor; /* The architecture is the following: @@ -79,7 +84,6 @@ pub(crate) struct GroupedRowHashAggregateStream { accumulators: Accumulators, aggregate_expressions: Vec>>, - aggr_expr: Vec>, group_expr: Vec>, accs_v2: Vec, @@ -131,7 +135,6 @@ impl GroupedRowHashAggregateStream { schema, mode, input, - aggr_expr, group_expr, accs_v2, group_schema, @@ -169,10 +172,8 @@ impl Stream for GroupedRowHashAggregateStream { &this.mode, &this.random_state, &this.group_expr, - &this.aggr_expr, &mut this.accs_v2, &this.group_schema, - &this.aggr_schema, &this.aggr_layout, this.aggr_buffer_width, batch, @@ -193,8 +194,10 @@ impl Stream for GroupedRowHashAggregateStream { let timer = this.baseline_metrics.elapsed_compute().timer(); let result = create_batch_from_map( &this.mode, - &this.accumulators, - this.group_expr.len(), + &this.group_schema, + &this.aggr_schema, + &mut this.accumulators, + &mut this.accs_v2, &this.schema, ) .record_output(&this.baseline_metrics); @@ -221,11 +224,9 @@ fn 
group_aggregate_batch( mode: &AggregateMode, random_state: &RandomState, group_expr: &[Arc], - aggr_expr: &[Arc], accs_v2: &mut [AccumulatorItemV2], group_schema: &Schema, - aggr_schema: &Schema, - aggr_row_layout: &RowLayout, + state_layout: &RowLayout, aggr_buffer_width: usize, batch: RecordBatch, accumulators: &mut Accumulators, @@ -239,7 +240,6 @@ fn group_aggregate_batch( // We could evaluate them after the `take`, but since we need to evaluate all // of them anyways, it is more performant to do it while they are together. let aggr_input_values = evaluate_many(aggregate_expressions, &batch)?; - let mut state_accessor = RowAccessor::new(aggr_schema, RowType::WordAligned); // 1.1 construct the key from the group values // 1.2 construct the mapping key if it does not exist @@ -349,9 +349,14 @@ fn group_aggregate_batch( ) }) .try_for_each(|(accumulator, values)| { - state_accessor.point_to(0, group_state.aggregation_buffer.as_mut_slice()); + let mut state_accessor = + RowAccessor::new_from_layout(state_layout.clone()); + state_accessor + .point_to(0, group_state.aggregation_buffer.as_mut_slice()); match mode { - AggregateMode::Partial => accumulator.update_batch(&values, &mut state_accessor), + AggregateMode::Partial => { + accumulator.update_batch(&values, &mut state_accessor) + } AggregateMode::FinalPartitioned | AggregateMode::Final => { // note: the aggregation here is over states, not values, thus the merge accumulator.merge_batch(&values, &mut state_accessor) @@ -421,6 +426,71 @@ fn create_group_rows(arrays: Vec, schema: &Schema) -> Vec> { results } -fn create_state_accessor(schema: &Schema) -> RowAccessor { - RowAccessor:: +/// Create a RecordBatch with all group keys and accumulator' states or values. 
+fn create_batch_from_map( + mode: &AggregateMode, + group_schema: &Schema, + aggr_schema: &Schema, + accumulators: &mut Accumulators, + accs_v2: &mut [AccumulatorItemV2], + output_schema: &Schema, +) -> ArrowResult { + if accumulators.group_states.is_empty() { + return Ok(RecordBatch::new_empty(Arc::new(output_schema.to_owned()))); + } + + let mut state_accessor = RowAccessor::new(aggr_schema, RowType::WordAligned); + + let (group_buffers, mut state_buffers): (Vec<_>, Vec<_>) = accumulators + .group_states + .iter() + .map(|gs| (gs.group_by_values.clone(), gs.aggregation_buffer.clone())) + .unzip(); + + let mut columns: Vec = + read_as_batch(&group_buffers, group_schema, RowType::Compact); + + match mode { + AggregateMode::Partial => columns.extend(read_as_batch( + &state_buffers, + aggr_schema, + RowType::WordAligned, + )), + AggregateMode::Final | AggregateMode::FinalPartitioned => { + let mut results: Vec> = vec![vec![]; accs_v2.len()]; + for buffer in state_buffers.iter_mut() { + state_accessor.point_to(0, buffer); + for (i, acc) in accs_v2.iter().enumerate() { + results[i].push(acc.evaluate(&state_accessor).unwrap()); + } + } + for scalars in results { + columns.push(ScalarValue::iter_to_array(scalars)?); + } + } + } + + // cast output if needed (e.g. 
for types like Dictionary where + // the intermediate GroupByScalar type was not the same as the + // output + let columns = columns + .iter() + .zip(output_schema.fields().iter()) + .map(|(col, desired_field)| cast(col, desired_field.data_type())) + .collect::>>()?; + + RecordBatch::try_new(Arc::new(output_schema.to_owned()), columns) +} + +fn read_as_batch(rows: &[Vec], schema: &Schema, row_type: RowType) -> Vec { + let row_num = rows.len(); + let mut output = MutableRecordBatch::new(row_num, Arc::new(schema.clone())); + let mut row = RowReader::new(&schema, row_type); + + for data in rows { + row.point_to(0, data); + read_row(&row, &mut output, schema); + } + + output.output_as_columns() } diff --git a/datafusion/physical-expr/src/aggregate/row_accumulator.rs b/datafusion/physical-expr/src/aggregate/accumulator_v2.rs similarity index 85% rename from datafusion/physical-expr/src/aggregate/row_accumulator.rs rename to datafusion/physical-expr/src/aggregate/accumulator_v2.rs index a0024f2eb3f22..dc8345064dcff 100644 --- a/datafusion/physical-expr/src/aggregate/row_accumulator.rs +++ b/datafusion/physical-expr/src/aggregate/accumulator_v2.rs @@ -18,11 +18,11 @@ //! Accumulator in raw format use arrow::array::ArrayRef; -use datafusion_common::Result; +use datafusion_common::{Result, ScalarValue}; use datafusion_row::accessor::RowAccessor; use std::fmt::Debug; -pub trait RowAccumulator: Send + Sync + Debug { +pub trait AccumulatorV2: Send + Sync + Debug { /// updates the accumulator's state from a vector of arrays. fn update_batch( &mut self, @@ -36,4 +36,7 @@ pub trait RowAccumulator: Send + Sync + Debug { states: &[ArrayRef], accessor: &mut RowAccessor, ) -> Result<()>; + + /// returns its value based on its current state. 
+ fn evaluate(&self, accessor: &RowAccessor) -> Result; } diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 10acde5170381..c95d8a07b211c 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; -use crate::aggregate::row_accumulator::RowAccumulator; +use crate::aggregate::accumulator_v2::AccumulatorV2; use crate::aggregate::sum; use crate::expressions::format_state_name; use crate::{AggregateExpr, PhysicalExpr}; @@ -104,7 +104,7 @@ impl AggregateExpr for Avg { &self.name } - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -120,8 +120,11 @@ impl AggregateExpr for Avg { ) } - fn create_accumulator_v2(&self, start_index: usize) -> Result> { - Ok(Box::new(AvgRowAccumulator::new( + fn create_accumulator_v2( + &self, + start_index: usize, + ) -> Result> { + Ok(Box::new(AvgAccumulatorV2::new( start_index, self.data_type.clone(), ))) @@ -193,12 +196,12 @@ impl Accumulator for AvgAccumulator { } #[derive(Debug)] -struct AvgRowAccumulator { +struct AvgAccumulatorV2 { start_index: usize, sum_datatype: DataType, } -impl AvgRowAccumulator { +impl AvgAccumulatorV2 { pub fn new(start_index: usize, sum_datatype: DataType) -> Self { Self { start_index, @@ -207,7 +210,7 @@ impl AvgRowAccumulator { } } -impl RowAccumulator for AvgRowAccumulator { +impl AccumulatorV2 for AvgAccumulatorV2 { fn update_batch( &mut self, values: &[ArrayRef], @@ -243,6 +246,19 @@ impl RowAccumulator for AvgRowAccumulator { )?; Ok(()) } + + fn evaluate(&self, accessor: &RowAccessor) -> Result { + assert_eq!(self.sum_datatype, DataType::Float64); + Ok(match accessor.get_u64_opt(self.start_index) { + None => ScalarValue::Float64(None), + Some(0) => ScalarValue::Float64(Some(0.0)), + Some(n) => 
ScalarValue::Float64( + accessor + .get_f64_opt(self.start_index + 1) + .map(|f| f / n as f64), + ), + }) + } } #[cfg(test)] diff --git a/datafusion/physical-expr/src/aggregate/count.rs b/datafusion/physical-expr/src/aggregate/count.rs index b66b98470760e..9ccd13d5753ed 100644 --- a/datafusion/physical-expr/src/aggregate/count.rs +++ b/datafusion/physical-expr/src/aggregate/count.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -use crate::aggregate::row_accumulator::RowAccumulator; +use crate::aggregate::accumulator_v2::AccumulatorV2; use crate::{AggregateExpr, PhysicalExpr}; use arrow::compute; use arrow::datatypes::DataType; @@ -96,12 +96,15 @@ impl AggregateExpr for Count { &self.name } - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { true } - fn create_accumulator_v2(&self, start_index: usize) -> Result> { - Ok(Box::new(CountRowAccumulator::new(start_index))) + fn create_accumulator_v2( + &self, + start_index: usize, + ) -> Result> { + Ok(Box::new(CountAccumulatorV2::new(start_index))) } } @@ -143,17 +146,17 @@ impl Accumulator for CountAccumulator { } #[derive(Debug)] -struct CountRowAccumulator { +struct CountAccumulatorV2 { index: usize, } -impl CountRowAccumulator { +impl CountAccumulatorV2 { pub fn new(index: usize) -> Self { Self { index } } } -impl RowAccumulator for CountRowAccumulator { +impl AccumulatorV2 for CountAccumulatorV2 { fn update_batch( &mut self, values: &[ArrayRef], @@ -177,6 +180,10 @@ impl RowAccumulator for CountRowAccumulator { } Ok(()) } + + fn evaluate(&self, accessor: &RowAccessor) -> Result { + Ok(accessor.get_as_scalar(&DataType::UInt64, self.index)) + } } #[cfg(test)] diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 9abbea062efda..2062d16c71245 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -37,7 +37,7 @@ use 
datafusion_common::ScalarValue; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::Accumulator; -use crate::aggregate::row_accumulator::RowAccumulator; +use crate::aggregate::accumulator_v2::AccumulatorV2; use crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; @@ -114,7 +114,7 @@ impl AggregateExpr for Max { &self.name } - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -130,8 +130,14 @@ impl AggregateExpr for Max { ) } - fn create_accumulator_v2(&self, start_index: usize) -> Result> { - Ok(Box::new(MaxRowAccumulator::new(start_index))) + fn create_accumulator_v2( + &self, + start_index: usize, + ) -> Result> { + Ok(Box::new(MaxAccumulatorV2::new( + start_index, + self.data_type.clone(), + ))) } } @@ -326,7 +332,7 @@ macro_rules! typed_min_max { } // min/max of two non-string scalar values. -macro_rules! typed_min_max_row { +macro_rules! typed_min_max_v2 { ($INDEX:ident, $ACC:ident, $SCALAR:expr, $TYPE:ident, $OP:ident) => {{ paste::item! { match $SCALAR { @@ -443,38 +449,38 @@ macro_rules! min_max { } // min/max of two scalar values of the same type -macro_rules! min_max_row { +macro_rules! 
min_max_v2 { ($INDEX:ident, $ACC:ident, $SCALAR:expr, $OP:ident) => {{ Ok(match $SCALAR { ScalarValue::Float64(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, f64, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, f64, $OP) } ScalarValue::Float32(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, f32, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, f32, $OP) } ScalarValue::UInt64(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, u64, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, u64, $OP) } ScalarValue::UInt32(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, u32, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, u32, $OP) } ScalarValue::UInt16(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, u16, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, u16, $OP) } ScalarValue::UInt8(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, u8, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, u8, $OP) } ScalarValue::Int64(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, i64, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, i64, $OP) } ScalarValue::Int32(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, i32, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, i32, $OP) } ScalarValue::Int16(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, i16, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, i16, $OP) } ScalarValue::Int8(rhs) => { - typed_min_max_row!($INDEX, $ACC, rhs, i8, $OP) + typed_min_max_v2!($INDEX, $ACC, rhs, i8, $OP) } e => { return Err(DataFusionError::Internal(format!( @@ -492,7 +498,7 @@ pub fn min(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { } pub fn min_row(index: usize, accessor: &mut RowAccessor, s: &ScalarValue) -> Result<()> { - min_max_row!(index, accessor, s, min) + min_max_v2!(index, accessor, s, min) } /// the maximum of two scalar values @@ -501,7 +507,7 @@ pub fn max(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { } pub fn max_row(index: usize, accessor: &mut RowAccessor, s: &ScalarValue) -> Result<()> { - min_max_row!(index, accessor, s, max) + min_max_v2!(index, accessor, s, max) } /// An 
accumulator to compute the maximum value @@ -541,17 +547,18 @@ impl Accumulator for MaxAccumulator { } #[derive(Debug)] -struct MaxRowAccumulator { +struct MaxAccumulatorV2 { index: usize, + data_type: DataType, } -impl MaxRowAccumulator { - pub fn new(index: usize) -> Self { - Self { index } +impl MaxAccumulatorV2 { + pub fn new(index: usize, data_type: DataType) -> Self { + Self { index, data_type } } } -impl RowAccumulator for MaxRowAccumulator { +impl AccumulatorV2 for MaxAccumulatorV2 { fn update_batch( &mut self, values: &[ArrayRef], @@ -570,6 +577,10 @@ impl RowAccumulator for MaxRowAccumulator { ) -> Result<()> { self.update_batch(states, accessor) } + + fn evaluate(&self, accessor: &RowAccessor) -> Result { + Ok(accessor.get_as_scalar(&self.data_type, self.index)) + } } /// MIN aggregate expression @@ -631,7 +642,7 @@ impl AggregateExpr for Min { &self.name } - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -647,8 +658,14 @@ impl AggregateExpr for Min { ) } - fn create_accumulator_v2(&self, start_index: usize) -> Result> { - Ok(Box::new(MinRowAccumulator::new(start_index))) + fn create_accumulator_v2( + &self, + start_index: usize, + ) -> Result> { + Ok(Box::new(MinAccumulatorV2::new( + start_index, + self.data_type.clone(), + ))) } } @@ -689,17 +706,18 @@ impl Accumulator for MinAccumulator { } #[derive(Debug)] -struct MinRowAccumulator { +struct MinAccumulatorV2 { index: usize, + data_type: DataType, } -impl MinRowAccumulator { - pub fn new(index: usize) -> Self { - Self { index } +impl MinAccumulatorV2 { + pub fn new(index: usize, data_type: DataType) -> Self { + Self { index, data_type } } } -impl RowAccumulator for MinRowAccumulator { +impl AccumulatorV2 for MinAccumulatorV2 { fn update_batch( &mut self, values: &[ArrayRef], @@ -718,6 +736,10 @@ impl RowAccumulator for MinRowAccumulator { ) -> Result<()> { self.update_batch(states, accessor) } + + fn evaluate(&self, 
accessor: &RowAccessor) -> Result { + Ok(accessor.get_as_scalar(&self.data_type, self.index)) + } } #[cfg(test)] diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index 411638a43ced2..327c40ad29c24 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::aggregate::accumulator_v2::AccumulatorV2; use crate::PhysicalExpr; use arrow::datatypes::Field; use datafusion_common::Result; @@ -22,7 +23,6 @@ use datafusion_expr::Accumulator; use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -use crate::aggregate::row_accumulator::RowAccumulator; pub(crate) mod approx_distinct; pub(crate) mod approx_median; @@ -37,9 +37,9 @@ pub(crate) mod covariance; pub(crate) mod distinct_expressions; #[macro_use] pub(crate) mod min_max; +pub mod accumulator_v2; pub mod build_in; mod hyperloglog; -pub mod row_accumulator; pub(crate) mod stats; pub(crate) mod stddev; pub(crate) mod sum; @@ -79,11 +79,14 @@ pub trait AggregateExpr: Send + Sync + Debug { } /// If the aggregate expression is supported by row format - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { false } - fn create_accumulator_v2(&self, _start_index: usize) -> Result> { + fn create_accumulator_v2( + &self, + _start_index: usize, + ) -> Result> { unreachable!() } } diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index 431a377871d81..e0ed2ffc72fd7 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -34,7 +34,7 @@ use arrow::{ use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::Accumulator; -use crate::aggregate::row_accumulator::RowAccumulator; +use crate::aggregate::accumulator_v2::AccumulatorV2; use 
crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; @@ -99,7 +99,7 @@ impl AggregateExpr for Sum { &self.name } - fn row_state_supported(&self) -> bool { + fn accumulator_v2_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -115,8 +115,14 @@ impl AggregateExpr for Sum { ) } - fn create_accumulator_v2(&self, start_index: usize) -> Result> { - Ok(Box::new(SumRowAccumulator::new(start_index, self.data_type.clone()))) + fn create_accumulator_v2( + &self, + start_index: usize, + ) -> Result> { + Ok(Box::new(SumAccumulatorV2::new( + start_index, + self.data_type.clone(), + ))) } } @@ -441,18 +447,18 @@ impl Accumulator for SumAccumulator { } #[derive(Debug)] -struct SumRowAccumulator { +struct SumAccumulatorV2 { index: usize, datatype: DataType, } -impl SumRowAccumulator { +impl SumAccumulatorV2 { pub fn new(index: usize, datatype: DataType) -> Self { Self { index, datatype } } } -impl RowAccumulator for SumRowAccumulator { +impl AccumulatorV2 for SumAccumulatorV2 { fn update_batch( &mut self, values: &[ArrayRef], @@ -470,6 +476,10 @@ impl RowAccumulator for SumRowAccumulator { ) -> Result<()> { self.update_batch(states, accessor) } + + fn evaluate(&self, accessor: &RowAccessor) -> Result { + Ok(accessor.get_as_scalar(&self.datatype, self.index)) + } } #[cfg(test)] diff --git a/datafusion/row/src/accessor.rs b/datafusion/row/src/accessor.rs index 7b204fe85b7c2..ae5c74b701352 100644 --- a/datafusion/row/src/accessor.rs +++ b/datafusion/row/src/accessor.rs @@ -18,10 +18,11 @@ //! Setter/Getter for row with all fixed-sized fields. 
use crate::layout::{RowLayout, RowType}; -use crate::validity::{all_valid, NullBitsFormatter}; -use crate::{fn_get_idx, fn_get_idx_opt, fn_set_idx, get_idx, set_idx}; -use arrow::datatypes::Schema; -use arrow::util::bit_util::{get_bit_raw, set_bit_raw, unset_bit_raw}; +use crate::validity::NullBitsFormatter; +use crate::{fn_get_idx, fn_get_idx_opt, fn_set_idx}; +use arrow::datatypes::{DataType, Schema}; +use arrow::util::bit_util::{get_bit_raw, set_bit_raw}; +use datafusion_common::ScalarValue; //TODO: DRY with reader and writer @@ -84,6 +85,20 @@ macro_rules! fn_max_min_idx { }; } +macro_rules! fn_get_idx_scalar { + ($NATIVE: ident, $SCALAR:ident) => { + paste::item! { + pub fn [](&self, idx: usize) -> ScalarValue { + if self.is_valid_at(idx) { + ScalarValue::$SCALAR(Some(self.[](idx))) + } else { + ScalarValue::$SCALAR(None) + } + } + } + }; +} + impl<'a> RowAccessor<'a> { /// new pub fn new(schema: &Schema, row_type: RowType) -> Self { @@ -94,6 +109,14 @@ impl<'a> RowAccessor<'a> { } } + pub fn new_from_layout(layout: RowLayout) -> Self { + Self { + layout, + data: &mut [], + base_offset: 0, + } + } + /// Update this row to point to position `offset` in `base` pub fn point_to(&mut self, offset: usize, data: &'a mut [u8]) { self.base_offset = offset; @@ -125,16 +148,6 @@ impl<'a> RowAccessor<'a> { } } - #[inline(always)] - fn all_valid(&self) -> bool { - if self.null_free() { - true - } else { - let null_bits = self.null_bits(); - all_valid(null_bits, self.layout.field_count) - } - } - fn is_valid_at(&self, idx: usize) -> bool { unsafe { get_bit_raw(self.null_bits().as_ptr(), idx) } } @@ -166,14 +179,6 @@ impl<'a> RowAccessor<'a> { fn_get_idx!(f32, 4); fn_get_idx!(f64, 8); - fn get_date32(&self, idx: usize) -> i32 { - get_idx!(i32, self, idx, 4) - } - - fn get_date64(&self, idx: usize) -> i64 { - get_idx!(i64, self, idx, 8) - } - fn_get_idx_opt!(bool); fn_get_idx_opt!(u8); fn_get_idx_opt!(u16); @@ -186,19 +191,32 @@ impl<'a> RowAccessor<'a> { 
fn_get_idx_opt!(f32); fn_get_idx_opt!(f64); - fn get_date32_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_date32(idx)) - } else { - None - } - } - - fn get_date64_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_date64(idx)) - } else { - None + fn_get_idx_scalar!(bool, Boolean); + fn_get_idx_scalar!(u8, UInt8); + fn_get_idx_scalar!(u16, UInt16); + fn_get_idx_scalar!(u32, UInt32); + fn_get_idx_scalar!(u64, UInt64); + fn_get_idx_scalar!(i8, Int8); + fn_get_idx_scalar!(i16, Int16); + fn_get_idx_scalar!(i32, Int32); + fn_get_idx_scalar!(i64, Int64); + fn_get_idx_scalar!(f32, Float32); + fn_get_idx_scalar!(f64, Float64); + + pub fn get_as_scalar(&self, dt: &DataType, index: usize) -> ScalarValue { + match dt { + DataType::Boolean => self.get_bool_scalar(index), + DataType::Int8 => self.get_i8_scalar(index), + DataType::Int16 => self.get_i16_scalar(index), + DataType::Int32 => self.get_i32_scalar(index), + DataType::Int64 => self.get_i64_scalar(index), + DataType::UInt8 => self.get_u8_scalar(index), + DataType::UInt16 => self.get_u16_scalar(index), + DataType::UInt32 => self.get_u32_scalar(index), + DataType::UInt64 => self.get_u64_scalar(index), + DataType::Float32 => self.get_f32_scalar(index), + DataType::Float64 => self.get_f64_scalar(index), + _ => unreachable!(), } } @@ -206,17 +224,6 @@ impl<'a> RowAccessor<'a> { // ----- Fixed Sized setters ---- // ------------------------------ - pub(crate) fn set_null_at(&mut self, idx: usize) { - assert!( - !self.null_free(), - "Unexpected call to set_null_at on null-free row writer" - ); - let null_bits = &mut self.data[0..self.layout.null_width]; - unsafe { - unset_bit_raw(null_bits.as_mut_ptr(), idx); - } - } - pub(crate) fn set_non_null_at(&mut self, idx: usize) { assert!( !self.null_free(), @@ -228,12 +235,6 @@ impl<'a> RowAccessor<'a> { } } - fn set_bool(&mut self, idx: usize, value: bool) { - self.assert_index_valid(idx); - let offset = 
self.field_offsets()[idx]; - self.data[offset] = if value { 1 } else { 0 }; - } - fn set_u8(&mut self, idx: usize, value: u8) { self.assert_index_valid(idx); let offset = self.field_offsets()[idx]; @@ -255,14 +256,6 @@ impl<'a> RowAccessor<'a> { self.data[offset] = value.to_le_bytes()[0]; } - fn set_date32(&mut self, idx: usize, value: i32) { - set_idx!(4, self, idx, value) - } - - fn set_date64(&mut self, idx: usize, value: i64) { - set_idx!(8, self, idx, value) - } - // ------------------------------ // ---- Fixed sized updaters ---- // ------------------------------ diff --git a/datafusion/row/src/layout.rs b/datafusion/row/src/layout.rs index 2c4c15da5a09e..adbe67ea52df9 100644 --- a/datafusion/row/src/layout.rs +++ b/datafusion/row/src/layout.rs @@ -38,7 +38,7 @@ pub enum RowType { } /// Reveals how the fields of a record are stored in the raw-bytes format -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct RowLayout { /// Type of the layout row_type: RowType, diff --git a/datafusion/row/src/lib.rs b/datafusion/row/src/lib.rs index f954b16bc36cd..c31bf751a1190 100644 --- a/datafusion/row/src/lib.rs +++ b/datafusion/row/src/lib.rs @@ -47,7 +47,7 @@ //! 0 1 2 10 14 22 31 32 //! 
-use arrow::array::{make_builder, ArrayBuilder}; +use arrow::array::{make_builder, ArrayBuilder, ArrayRef}; use arrow::datatypes::Schema; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; @@ -85,6 +85,10 @@ impl MutableRecordBatch { let result = make_batch(self.schema.clone(), self.arrays.drain(..).collect()); result } + + pub fn output_as_columns(&mut self) -> Vec { + get_columns(self.arrays.drain(..).collect()) + } } fn new_arrays(schema: &Schema, batch_size: usize) -> Vec> { @@ -106,6 +110,10 @@ fn make_batch( RecordBatch::try_new(schema, columns) } +fn get_columns(mut arrays: Vec>) -> Vec { + arrays.iter_mut().map(|array| array.finish()).collect() +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/row/src/reader.rs b/datafusion/row/src/reader.rs index 77e9a552cbf84..1bf6e102a9f2c 100644 --- a/datafusion/row/src/reader.rs +++ b/datafusion/row/src/reader.rs @@ -76,7 +76,7 @@ macro_rules! fn_get_idx { macro_rules! fn_get_idx_opt { ($NATIVE: ident) => { paste::item! 
{ - fn [](&self, idx: usize) -> Option<$NATIVE> { + pub fn [](&self, idx: usize) -> Option<$NATIVE> { if self.is_valid_at(idx) { Some(self.[](idx)) } else { From 430c31577a1790df6512ebe8b9d01ef7696b224d Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Thu, 5 May 2022 14:45:59 +0800 Subject: [PATCH 5/8] aggregate with row state --- .../core/src/physical_plan/aggregates/mod.rs | 24 +++- .../src/physical_plan/aggregates/row_hash.rs | 114 +++++++----------- .../core/src/physical_plan/hash_utils.rs | 6 +- .../src/aggregate/accumulator_v2.rs | 2 +- .../physical-expr/src/aggregate/average.rs | 6 +- datafusion/row/src/layout.rs | 9 +- datafusion/row/src/lib.rs | 7 +- 7 files changed, 88 insertions(+), 80 deletions(-) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index abb68d8010367..8e6f0c4c1b44d 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -43,9 +43,11 @@ mod hash; mod no_grouping; mod row_hash; +use crate::physical_plan::aggregates::row_hash::GroupedHashAggregateStreamV2; pub use datafusion_expr::AggregateFunction; use datafusion_physical_expr::aggregate::accumulator_v2::AccumulatorV2; pub use datafusion_physical_expr::expressions::create_aggregate_expr; +use datafusion_row::{row_supported, RowType}; /// Hash aggregate modes #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -145,6 +147,12 @@ impl AggregateExec { pub fn input_schema(&self) -> SchemaRef { self.input_schema.clone() } + + fn row_aggregate_supported(&self) -> bool { + let group_schema = group_schema(&self.schema, self.group_expr.len()); + row_supported(&group_schema, RowType::Compact) + && accumulator_v2_supported(&self.aggr_expr) + } } impl ExecutionPlan for AggregateExec { @@ -215,6 +223,15 @@ impl ExecutionPlan for AggregateExec { input, baseline_metrics, )?)) + } else if self.row_aggregate_supported() { + Ok(Box::pin(GroupedHashAggregateStreamV2::new( + 
self.mode, + self.schema.clone(), + group_expr, + self.aggr_expr.clone(), + input, + baseline_metrics, + )?)) } else { Ok(Box::pin(GroupedHashAggregateStream::new( self.mode, @@ -318,6 +335,11 @@ fn create_schema( Ok(Schema::new(fields)) } +fn group_schema(schema: &Schema, group_count: usize) -> SchemaRef { + let group_fields = schema.fields()[0..group_count].to_vec(); + Arc::new(Schema::new(group_fields)) +} + /// returns physical expressions to evaluate against a batch /// The expressions are different depending on `mode`: /// * Partial: AggregateExpr::expressions @@ -376,7 +398,7 @@ fn create_accumulators( .collect::>>() } -fn check_accumulator_v2_supported(aggr_expr: &[Arc]) -> bool { +fn accumulator_v2_supported(aggr_expr: &[Arc]) -> bool { aggr_expr.iter().all(|expr| expr.accumulator_v2_supported()) } diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index 2da8e39d42d8a..0ee46c880914b 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -29,7 +29,7 @@ use futures::{ use crate::error::Result; use crate::physical_plan::aggregates::{ - evaluate, evaluate_many, AccumulatorItemV2, AggregateMode, + evaluate, evaluate_many, group_schema, AccumulatorItemV2, AggregateMode, }; use crate::physical_plan::hash_utils::create_row_hashes; use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; @@ -52,60 +52,38 @@ use datafusion_row::writer::{write_row, RowWriter}; use datafusion_row::{MutableRecordBatch, RowType}; use hashbrown::raw::RawTable; -/* -The architecture is the following: - -1. An accumulator has state that is updated on each batch. -2. At the end of the aggregation (e.g. end of batches in a partition), the accumulator converts its state to a RecordBatch of a single row -3. The RecordBatches of all accumulators are merged (`concatenate` in `rust/arrow`) together to a single RecordBatch. -4. 
The state's RecordBatch is `merge`d to a new state -5. The state is mapped to the final value - -Why: - -* Accumulators' state can be statically typed, but it is more efficient to transmit data from the accumulators via `Array` -* The `merge` operation must have access to the state of the aggregators because it uses it to correctly merge -* It uses Arrow's native dynamically typed object, `Array`. -* Arrow shines in batch operations and both `merge` and `concatenate` of uniform types are very performant. - -Example: average - -* the state is `n: u32` and `sum: f64` -* For every batch, we update them accordingly. -* At the end of the accumulation (of a partition), we convert `n` and `sum` to a RecordBatch of 1 row and two columns: `[n, sum]` -* The RecordBatch is (sent back / transmitted over network) -* Once all N record batches arrive, `merge` is performed, which builds a RecordBatch with N rows and 2 columns. -* Finally, `get_value` returns an array with one entry computed from the state -*/ -pub(crate) struct GroupedRowHashAggregateStream { +/// Grouping aggregate with row format to store the aggregation state. +/// +/// The Architecture is similar to that in [`super::GroupedHashAggregateStream`] but use +/// row format inside the HashTable to store aggregation buffers. 
+pub(crate) struct GroupedHashAggregateStreamV2 { schema: SchemaRef, input: SendableRecordBatchStream, mode: AggregateMode, - accumulators: Accumulators, + aggr_state: AggregationState, aggregate_expressions: Vec>>, group_expr: Vec>, - accs_v2: Vec, + accumulators: Vec, group_schema: SchemaRef, aggr_schema: SchemaRef, aggr_layout: RowLayout, - aggr_buffer_width: usize, baseline_metrics: BaselineMetrics, random_state: RandomState, finished: bool, } -fn create_separate_schema(schema: &Schema, group_count: usize) -> (SchemaRef, SchemaRef) { - let (group_fields, aggr_fields) = schema.fields().split_at(group_count); - ( - Arc::new(Schema::new(group_fields.to_vec())), - Arc::new(Schema::new(aggr_fields.to_vec())), - ) +fn aggr_state_schema(aggr_expr: &[Arc]) -> Result { + let fields = aggr_expr + .iter() + .flat_map(|expr| expr.state_fields().unwrap().into_iter()) + .collect::>(); + Ok(Arc::new(Schema::new(fields))) } -impl GroupedRowHashAggregateStream { +impl GroupedHashAggregateStreamV2 { /// Create a new GroupedRowHashAggregateStream pub fn new( mode: AggregateMode, @@ -123,12 +101,12 @@ impl GroupedRowHashAggregateStream { let aggregate_expressions = aggregates::aggregate_expressions(&aggr_expr, &mode, group_expr.len())?; - let accs_v2 = aggregates::create_accumulators_v2(&aggr_expr)?; + let accumulators = aggregates::create_accumulators_v2(&aggr_expr)?; + + let group_schema = group_schema(&schema, group_expr.len()); + let aggr_schema = aggr_state_schema(&aggr_expr)?; - let (group_schema, aggr_schema) = - create_separate_schema(&schema, group_expr.len()); let aggr_layout = RowLayout::new(&aggr_schema, RowType::WordAligned); - let aggr_buffer_width = aggr_layout.fixed_part_width(); timer.done(); Ok(Self { @@ -136,21 +114,20 @@ impl GroupedRowHashAggregateStream { mode, input, group_expr, - accs_v2, + accumulators, group_schema, aggr_schema, aggr_layout, - aggr_buffer_width, baseline_metrics, aggregate_expressions, - accumulators: Default::default(), + aggr_state: 
Default::default(), random_state: Default::default(), finished: false, }) } } -impl Stream for GroupedRowHashAggregateStream { +impl Stream for GroupedHashAggregateStreamV2 { type Item = ArrowResult; fn poll_next( @@ -172,12 +149,11 @@ impl Stream for GroupedRowHashAggregateStream { &this.mode, &this.random_state, &this.group_expr, - &mut this.accs_v2, + &mut this.accumulators, &this.group_schema, &this.aggr_layout, - this.aggr_buffer_width, batch, - &mut this.accumulators, + &mut this.aggr_state, &this.aggregate_expressions, ); @@ -196,8 +172,8 @@ impl Stream for GroupedRowHashAggregateStream { &this.mode, &this.group_schema, &this.aggr_schema, + &mut this.aggr_state, &mut this.accumulators, - &mut this.accs_v2, &this.schema, ) .record_output(&this.baseline_metrics); @@ -213,23 +189,23 @@ impl Stream for GroupedRowHashAggregateStream { } } -impl RecordBatchStream for GroupedRowHashAggregateStream { +impl RecordBatchStream for GroupedHashAggregateStreamV2 { fn schema(&self) -> SchemaRef { self.schema.clone() } } -/// TODO: Make this a member function of [`GroupedRowHashAggregateStream`] +/// TODO: Make this a member function of [`GroupedHashAggregateStreamV2`] +#[allow(clippy::too_many_arguments)] fn group_aggregate_batch( mode: &AggregateMode, random_state: &RandomState, group_expr: &[Arc], - accs_v2: &mut [AccumulatorItemV2], + accumulators: &mut [AccumulatorItemV2], group_schema: &Schema, state_layout: &RowLayout, - aggr_buffer_width: usize, batch: RecordBatch, - accumulators: &mut Accumulators, + aggr_state: &mut AggregationState, aggregate_expressions: &[Vec>], ) -> Result<()> { // evaluate the grouping expressions @@ -245,7 +221,7 @@ fn group_aggregate_batch( // 1.2 construct the mapping key if it does not exist // 1.3 add the row' index to `indices` - // track which entries in `accumulators` have rows in this batch to aggregate + // track which entries in `aggr_state` have rows in this batch to aggregate let mut groups_with_rows = vec![]; // 1.1 Calculate 
the group keys for the group values @@ -253,7 +229,7 @@ fn group_aggregate_batch( create_row_hashes(&group_rows, random_state, &mut batch_hashes)?; for (row, hash) in batch_hashes.into_iter().enumerate() { - let Accumulators { map, group_states } = accumulators; + let AggregationState { map, group_states } = aggr_state; let entry = map.get_mut(hash, |(_hash, group_idx)| { // verify that a group that we are inserting with hash is @@ -278,7 +254,7 @@ fn group_aggregate_batch( // Add new entry to group_states and save newly created index let group_state = RowGroupState { group_by_values: group_rows[row].clone(), - aggregation_buffer: Vec::with_capacity(aggr_buffer_width), + aggregation_buffer: vec![0; state_layout.fixed_part_width()], indices: vec![row as u32], // 1.3 }; let group_idx = group_states.len(); @@ -296,7 +272,7 @@ fn group_aggregate_batch( let mut offsets = vec![0]; let mut offset_so_far = 0; for group_idx in groups_with_rows.iter() { - let indices = &accumulators.group_states[*group_idx].indices; + let indices = &aggr_state.group_states[*group_idx].indices; batch_indices.append_slice(indices)?; offset_so_far += indices.len(); offsets.push(offset_so_far); @@ -331,9 +307,9 @@ fn group_aggregate_batch( .iter() .zip(offsets.windows(2)) .try_for_each(|(group_idx, offsets)| { - let group_state = &mut accumulators.group_states[*group_idx]; + let group_state = &mut aggr_state.group_states[*group_idx]; // 2.2 - accs_v2 + accumulators .iter_mut() .zip(values.iter()) .map(|(accumulator, aggr_array)| { @@ -389,7 +365,7 @@ struct RowGroupState { /// The state of all the groups #[derive(Default)] -struct Accumulators { +struct AggregationState { /// Logically maps group values to an index in `group_states` /// /// Uses the raw API of hashbrown to avoid actually storing the @@ -403,7 +379,7 @@ struct Accumulators { group_states: Vec, } -impl std::fmt::Debug for Accumulators { +impl std::fmt::Debug for AggregationState { fn fmt(&self, f: &mut std::fmt::Formatter) -> 
std::fmt::Result { // hashes are not store inline, so could only get values let map_string = "RawTable"; @@ -431,17 +407,17 @@ fn create_batch_from_map( mode: &AggregateMode, group_schema: &Schema, aggr_schema: &Schema, - accumulators: &mut Accumulators, - accs_v2: &mut [AccumulatorItemV2], + aggr_state: &mut AggregationState, + accumulators: &mut [AccumulatorItemV2], output_schema: &Schema, ) -> ArrowResult { - if accumulators.group_states.is_empty() { + if aggr_state.group_states.is_empty() { return Ok(RecordBatch::new_empty(Arc::new(output_schema.to_owned()))); } let mut state_accessor = RowAccessor::new(aggr_schema, RowType::WordAligned); - let (group_buffers, mut state_buffers): (Vec<_>, Vec<_>) = accumulators + let (group_buffers, mut state_buffers): (Vec<_>, Vec<_>) = aggr_state .group_states .iter() .map(|gs| (gs.group_by_values.clone(), gs.aggregation_buffer.clone())) @@ -457,10 +433,10 @@ fn create_batch_from_map( RowType::WordAligned, )), AggregateMode::Final | AggregateMode::FinalPartitioned => { - let mut results: Vec> = vec![vec![]; accs_v2.len()]; + let mut results: Vec> = vec![vec![]; accumulators.len()]; for buffer in state_buffers.iter_mut() { state_accessor.point_to(0, buffer); - for (i, acc) in accs_v2.iter().enumerate() { + for (i, acc) in accumulators.iter().enumerate() { results[i].push(acc.evaluate(&state_accessor).unwrap()); } } @@ -485,7 +461,7 @@ fn create_batch_from_map( fn read_as_batch(rows: &[Vec], schema: &Schema, row_type: RowType) -> Vec { let row_num = rows.len(); let mut output = MutableRecordBatch::new(row_num, Arc::new(schema.clone())); - let mut row = RowReader::new(&schema, row_type); + let mut row = RowReader::new(schema, row_type); for data in rows { row.point_to(0, data); diff --git a/datafusion/core/src/physical_plan/hash_utils.rs b/datafusion/core/src/physical_plan/hash_utils.rs index 3c0207a863cf7..65099a79e0913 100644 --- a/datafusion/core/src/physical_plan/hash_utils.rs +++ 
b/datafusion/core/src/physical_plan/hash_utils.rs @@ -265,7 +265,7 @@ pub fn create_hashes<'a>( for hash in hashes_buffer.iter_mut() { *hash = 0 } - return Ok(hashes_buffer); + Ok(hashes_buffer) } /// Test version of `create_row_hashes` that produces the same value for @@ -281,7 +281,7 @@ pub fn create_row_hashes<'a>( for hash in hashes_buffer.iter_mut() { *hash = 0 } - return Ok(hashes_buffer); + Ok(hashes_buffer) } /// Test version of `create_row_hashes` that produces the same value for @@ -300,7 +300,7 @@ pub fn create_row_hashes<'a>( for (i, hash) in hashes_buffer.iter_mut().enumerate() { *hash = >::get_hash(&rows[i], random_state); } - return Ok(hashes_buffer); + Ok(hashes_buffer) } /// Creates hash values for every row, based on the values in the diff --git a/datafusion/physical-expr/src/aggregate/accumulator_v2.rs b/datafusion/physical-expr/src/aggregate/accumulator_v2.rs index dc8345064dcff..d2ebf12f710c8 100644 --- a/datafusion/physical-expr/src/aggregate/accumulator_v2.rs +++ b/datafusion/physical-expr/src/aggregate/accumulator_v2.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Accumulator in raw format +//! 
Accumulator over row format use arrow::array::ArrayRef; use datafusion_common::{Result, ScalarValue}; diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 2847502e77caf..42960c9a76b78 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -217,9 +217,11 @@ impl AccumulatorV2 for AvgAccumulatorV2 { accessor: &mut RowAccessor, ) -> Result<()> { let values = &values[0]; - + // count let delta = (values.len() - values.data().null_count()) as u64; accessor.add_u64(self.start_index, delta); + + // sum sum::add_to_row( &self.sum_datatype, self.start_index + 1, @@ -235,9 +237,11 @@ impl AccumulatorV2 for AvgAccumulatorV2 { accessor: &mut RowAccessor, ) -> Result<()> { let counts = states[0].as_any().downcast_ref::().unwrap(); + // count let delta = compute::sum(counts).unwrap_or(0); accessor.add_u64(self.start_index, delta); + // sum sum::add_to_row( &self.sum_datatype, self.start_index + 1, diff --git a/datafusion/row/src/layout.rs b/datafusion/row/src/layout.rs index adbe67ea52df9..0c92025a74f4c 100644 --- a/datafusion/row/src/layout.rs +++ b/datafusion/row/src/layout.rs @@ -57,7 +57,12 @@ pub struct RowLayout { impl RowLayout { /// new pub fn new(schema: &Schema, row_type: RowType) -> Self { - assert!(row_supported(schema, row_type)); + assert!( + row_supported(schema, row_type), + "{:?}Row with {:?} not supported yet.", + row_type, + schema, + ); let null_free = schema_null_free(schema); let field_count = schema.fields().len(); let null_width = if null_free { @@ -151,7 +156,7 @@ pub(crate) fn estimate_row_width(schema: &Schema, layout: &RowLayout) -> usize { /// Tell if we can create raw-bytes based rows since we currently /// has limited data type supports in the row format -fn row_supported(schema: &Schema, row_type: RowType) -> bool { +pub fn row_supported(schema: &Schema, row_type: RowType) -> bool { schema .fields() .iter() diff 
--git a/datafusion/row/src/lib.rs b/datafusion/row/src/lib.rs index c31bf751a1190..c05cbcd0ef1c4 100644 --- a/datafusion/row/src/lib.rs +++ b/datafusion/row/src/lib.rs @@ -51,6 +51,7 @@ use arrow::array::{make_builder, ArrayBuilder, ArrayRef}; use arrow::datatypes::Schema; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; +pub use layout::row_supported; pub use layout::RowType; use std::sync::Arc; @@ -350,7 +351,7 @@ mod tests { ); #[test] - #[should_panic(expected = "row_supported(schema, row_type)")] + #[should_panic(expected = "not supported yet")] fn test_unsupported_word_aligned_type() { let a: ArrayRef = Arc::new(StringArray::from(vec!["hello", "world"])); let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap(); @@ -389,7 +390,7 @@ mod tests { } #[test] - #[should_panic(expected = "row_supported(schema, row_type)")] + #[should_panic(expected = "not supported yet")] fn test_unsupported_type_write() { let a: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap(); @@ -399,7 +400,7 @@ mod tests { } #[test] - #[should_panic(expected = "row_supported(schema, row_type)")] + #[should_panic(expected = "not supported yet")] fn test_unsupported_type_read() { let schema = Arc::new(Schema::new(vec![Field::new( "a", From 1cf0ba56425e00dac5e1cb81cf1d50c783ce2c72 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Thu, 5 May 2022 19:18:23 +0800 Subject: [PATCH 6/8] make row non-optional --- datafusion/core/Cargo.toml | 9 +++------ datafusion/core/benches/aggregate_query_sql.rs | 10 ++++++++++ datafusion/core/src/lib.rs | 1 - datafusion/physical-expr/Cargo.toml | 5 ++--- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 510c385733706..e11e02e95bdf1 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -41,15 +41,13 @@ path = "src/lib.rs" # Used to enable the avro 
format avro = ["avro-rs", "num-traits", "datafusion-common/avro"] crypto_expressions = ["datafusion-physical-expr/crypto_expressions"] -default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "row"] +default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) force_hash_collisions = [] # Used to enable JIT code generation jit = ["datafusion-jit"] pyarrow = ["pyo3", "arrow/pyarrow", "datafusion-common/pyarrow"] regex_expressions = ["datafusion-physical-expr/regex_expressions"] -# Used to enable row format experiment -row = ["datafusion-row"] # Used to enable scheduler scheduler = ["rayon"] simd = ["arrow/simd"] @@ -66,7 +64,7 @@ datafusion-data-access = { path = "../../data-access", version = "1.0.0" } datafusion-expr = { path = "../expr", version = "7.0.0" } datafusion-jit = { path = "../jit", version = "7.0.0", optional = true } datafusion-physical-expr = { path = "../physical-expr", version = "7.0.0" } -datafusion-row = { path = "../row", version = "7.0.0", optional = true } +datafusion-row = { path = "../row", version = "7.0.0" } futures = "0.3" hashbrown = { version = "0.12", features = ["raw"] } lazy_static = { version = "^1.4.0" } @@ -134,8 +132,7 @@ name = "sql_planner" [[bench]] harness = false name = "jit" -required-features = ["row", "jit"] +required-features = ["jit"] [[test]] name = "row" -required-features = ["row"] diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs index 807e64ff5e273..8570f81700c50 100644 --- a/datafusion/core/benches/aggregate_query_sql.rs +++ b/datafusion/core/benches/aggregate_query_sql.rs @@ -133,6 +133,16 @@ fn criterion_benchmark(c: &mut Criterion) { ) }) }); + + c.bench_function("aggregate_query_group_by_u64_multiple_keys", |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT u64_wide, utf8, MIN(f64), AVG(f64), COUNT(f64) \ + FROM t GROUP BY 
u64_wide, utf8", + ) + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index c598d9a33cefb..b553c0ed84b53 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -233,7 +233,6 @@ pub use datafusion_data_access; pub use datafusion_expr as logical_expr; pub use datafusion_physical_expr as physical_expr; -#[cfg(feature = "row")] pub use datafusion_row as row; #[cfg(feature = "jit")] diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index ba4d11c0be6a4..d64ecb07b7142 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -34,9 +34,8 @@ path = "src/lib.rs" [features] crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] -default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "row"] +default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] regex_expressions = ["regex"] -row = ["datafusion-row"] unicode_expressions = ["unicode-segmentation"] [dependencies] @@ -47,7 +46,7 @@ blake3 = { version = "1.0", optional = true } chrono = { version = "0.4", default-features = false } datafusion-common = { path = "../common", version = "7.0.0" } datafusion-expr = { path = "../expr", version = "7.0.0" } -datafusion-row = { path = "../row", version = "7.0.0", optional = true } +datafusion-row = { path = "../row", version = "7.0.0" } hashbrown = { version = "0.12", features = ["raw"] } lazy_static = { version = "^1.4.0" } md-5 = { version = "^0.10.0", optional = true } From c8b4833a2cb83b5b70c78d9082acfbc60eb3213f Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 6 May 2022 19:12:51 +0800 Subject: [PATCH 7/8] address comments, add docs, part fix #2455 --- .../core/src/physical_plan/aggregates/mod.rs | 16 ++++-- .../src/physical_plan/aggregates/row_hash.rs | 26 +++++++--- .../core/src/physical_plan/hash_utils.rs | 5 +- datafusion/core/tests/sql/aggregates.rs | 22 
++++++++ datafusion/core/tests/sql/functions.rs | 4 +- .../physical-expr/src/aggregate/average.rs | 51 +++++++++++-------- .../physical-expr/src/aggregate/count.rs | 31 ++++++----- .../physical-expr/src/aggregate/min_max.rs | 40 +++++++++------ datafusion/physical-expr/src/aggregate/mod.rs | 15 ++++-- .../{accumulator_v2.rs => row_accumulator.rs} | 25 ++++++++- datafusion/physical-expr/src/aggregate/sum.rs | 40 ++++++++++----- datafusion/row/src/accessor.rs | 7 +-- 12 files changed, 193 insertions(+), 89 deletions(-) rename datafusion/physical-expr/src/aggregate/{accumulator_v2.rs => row_accumulator.rs} (54%) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index 8e6f0c4c1b44d..abe20cdcbc94e 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -45,7 +45,7 @@ mod row_hash; use crate::physical_plan::aggregates::row_hash::GroupedHashAggregateStreamV2; pub use datafusion_expr::AggregateFunction; -use datafusion_physical_expr::aggregate::accumulator_v2::AccumulatorV2; +use datafusion_physical_expr::aggregate::row_accumulator::RowAccumulator; pub use datafusion_physical_expr::expressions::create_aggregate_expr; use datafusion_row::{row_supported, RowType}; @@ -387,7 +387,7 @@ fn merge_expressions( } pub(crate) type AccumulatorItem = Box; -pub(crate) type AccumulatorItemV2 = Box; +pub(crate) type AccumulatorItemV2 = Box; fn create_accumulators( aggr_expr: &[Arc], @@ -399,16 +399,22 @@ fn create_accumulators( } fn accumulator_v2_supported(aggr_expr: &[Arc]) -> bool { - aggr_expr.iter().all(|expr| expr.accumulator_v2_supported()) + aggr_expr + .iter() + .all(|expr| expr.row_accumulator_supported()) } fn create_accumulators_v2( aggr_expr: &[Arc], ) -> datafusion_common::Result> { + let mut state_index = 0; aggr_expr .iter() - .enumerate() - .map(|(idx, expr)| expr.create_accumulator_v2(idx)) + .map(|expr| { + let result = 
expr.create_row_accumulator(state_index); + state_index += expr.state_fields().unwrap().len(); + result + }) .collect::>>() } diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index 0ee46c880914b..eac1590dbb110 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -52,10 +52,22 @@ use datafusion_row::writer::{write_row, RowWriter}; use datafusion_row::{MutableRecordBatch, RowType}; use hashbrown::raw::RawTable; -/// Grouping aggregate with row format to store the aggregation state. +/// Grouping aggregate with row-format aggregation states inside. /// -/// The Architecture is similar to that in [`super::GroupedHashAggregateStream`] but use -/// row format inside the HashTable to store aggregation buffers. +/// For each aggregation entry, we use: +/// - [Compact] row represents grouping keys for fast hash computation and comparison directly on raw bytes. +/// - [WordAligned] row to store aggregation state, designed to be CPU-friendly when updates over every field are often. +/// +/// The architecture is the following: +/// +/// 1. For each input RecordBatch, update aggregation states corresponding to all appeared grouping keys. +/// 2. At the end of the aggregation (e.g. end of batches in a partition), the accumulator converts its state to a RecordBatch of a single row +/// 3. The RecordBatches of all accumulators are merged (`concatenate` in `rust/arrow`) together to a single RecordBatch. +/// 4. The state's RecordBatch is `merge`d to a new state +/// 5. 
The state is mapped to the final value +/// +/// [Compact]: datafusion_row::layout::RowType::Compact +/// [WordAligned]: datafusion_row::layout::RowType::WordAligned pub(crate) struct GroupedHashAggregateStreamV2 { schema: SchemaRef, input: SendableRecordBatchStream, @@ -68,7 +80,7 @@ pub(crate) struct GroupedHashAggregateStreamV2 { group_schema: SchemaRef, aggr_schema: SchemaRef, - aggr_layout: RowLayout, + aggr_layout: Arc, baseline_metrics: BaselineMetrics, random_state: RandomState, @@ -106,7 +118,7 @@ impl GroupedHashAggregateStreamV2 { let group_schema = group_schema(&schema, group_expr.len()); let aggr_schema = aggr_state_schema(&aggr_expr)?; - let aggr_layout = RowLayout::new(&aggr_schema, RowType::WordAligned); + let aggr_layout = Arc::new(RowLayout::new(&aggr_schema, RowType::WordAligned)); timer.done(); Ok(Self { @@ -151,7 +163,7 @@ impl Stream for GroupedHashAggregateStreamV2 { &this.group_expr, &mut this.accumulators, &this.group_schema, - &this.aggr_layout, + this.aggr_layout.clone(), batch, &mut this.aggr_state, &this.aggregate_expressions, @@ -203,7 +215,7 @@ fn group_aggregate_batch( group_expr: &[Arc], accumulators: &mut [AccumulatorItemV2], group_schema: &Schema, - state_layout: &RowLayout, + state_layout: Arc, batch: RecordBatch, aggr_state: &mut AggregationState, aggregate_expressions: &[Vec>], diff --git a/datafusion/core/src/physical_plan/hash_utils.rs b/datafusion/core/src/physical_plan/hash_utils.rs index 65099a79e0913..e68623be93c59 100644 --- a/datafusion/core/src/physical_plan/hash_utils.rs +++ b/datafusion/core/src/physical_plan/hash_utils.rs @@ -284,10 +284,7 @@ pub fn create_row_hashes<'a>( Ok(hashes_buffer) } -/// Test version of `create_row_hashes` that produces the same value for -/// all hashes (to test collisions) -/// -/// See comments on `hashes_buffer` for more details +/// Creates hash values for every row, based on their raw bytes. 
#[cfg(not(feature = "force_hash_collisions"))] pub fn create_row_hashes<'a>( rows: &[Vec], diff --git a/datafusion/core/tests/sql/aggregates.rs b/datafusion/core/tests/sql/aggregates.rs index b488e880dcf83..d8ec9e0167e60 100644 --- a/datafusion/core/tests/sql/aggregates.rs +++ b/datafusion/core/tests/sql/aggregates.rs @@ -652,6 +652,28 @@ async fn csv_query_array_agg_one() -> Result<()> { Ok(()) } +#[tokio::test] +async fn csv_query_array_agg_with_overflow() -> Result<()> { + let ctx = SessionContext::new(); + register_aggregate_csv(&ctx).await?; + let sql = + "select c2, sum(c3) sum_c3, avg(c3) avg_c3, max(c3) max_c3, min(c3) min_c3, count(c3) count_c3 from aggregate_test_100 group by c2 order by c2"; + let actual = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+----+--------+---------------------+--------+--------+----------+", + "| c2 | sum_c3 | avg_c3 | max_c3 | min_c3 | count_c3 |", + "+----+--------+---------------------+--------+--------+----------+", + "| 1 | 367 | 16.681818181818183 | 125 | -99 | 22 |", + "| 2 | 184 | 8.363636363636363 | 122 | -117 | 22 |", + "| 3 | 395 | 20.789473684210527 | 123 | -101 | 19 |", + "| 4 | 29 | 1.2608695652173914 | 123 | -117 | 23 |", + "| 5 | -194 | -13.857142857142858 | 118 | -101 | 14 |", + "+----+--------+---------------------+--------+--------+----------+", + ]; + assert_batches_eq!(expected, &actual); + Ok(()) +} + #[tokio::test] async fn csv_query_array_agg_distinct() -> Result<()> { let ctx = SessionContext::new(); diff --git a/datafusion/core/tests/sql/functions.rs b/datafusion/core/tests/sql/functions.rs index 857781aa35a3c..59236c467fd64 100644 --- a/datafusion/core/tests/sql/functions.rs +++ b/datafusion/core/tests/sql/functions.rs @@ -17,7 +17,6 @@ use super::*; -/// sqrt(f32) is slightly different than sqrt(CAST(f32 AS double))) #[tokio::test] async fn sqrt_f32_vs_f64() -> Result<()> { let ctx = create_ctx()?; @@ -25,7 +24,8 @@ async fn sqrt_f32_vs_f64() -> Result<()> { // sqrt(f32)'s plan 
passes let sql = "SELECT avg(sqrt(c11)) FROM aggregate_test_100"; let actual = execute(&ctx, sql).await; - let expected = vec![vec!["0.6584407806396484"]]; + let sql = "SELECT avg(CAST(sqrt(c11) AS double)) FROM aggregate_test_100"; + let expected = execute(&ctx, sql).await; assert_eq!(actual, expected); let sql = "SELECT avg(sqrt(CAST(c11 AS double))) FROM aggregate_test_100"; diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 42960c9a76b78..3eee84bb5f508 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; -use crate::aggregate::accumulator_v2::AccumulatorV2; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::aggregate::sum; use crate::expressions::format_state_name; use crate::{AggregateExpr, PhysicalExpr}; @@ -104,7 +104,7 @@ impl AggregateExpr for Avg { &self.name } - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -120,11 +120,11 @@ impl AggregateExpr for Avg { ) } - fn create_accumulator_v2( + fn create_row_accumulator( &self, start_index: usize, - ) -> Result> { - Ok(Box::new(AvgAccumulatorV2::new( + ) -> Result> { + Ok(Box::new(AvgRowAccumulator::new( start_index, self.data_type.clone(), ))) @@ -158,7 +158,10 @@ impl Accumulator for AvgAccumulator { let values = &values[0]; self.count += (values.len() - values.data().null_count()) as u64; - self.sum = sum::sum(&self.sum, &sum::sum_batch(values)?)?; + self.sum = sum::sum( + &self.sum, + &sum::sum_batch(values, &self.sum.get_datatype())?, + )?; Ok(()) } @@ -168,7 +171,10 @@ impl Accumulator for AvgAccumulator { self.count += compute::sum(counts).unwrap_or(0); // sums are summed - self.sum = sum::sum(&self.sum, &sum::sum_batch(&states[1])?)?; + self.sum = sum::sum( + &self.sum, + 
&sum::sum_batch(&states[1], &self.sum.get_datatype())?, + )?; Ok(()) } @@ -196,21 +202,21 @@ impl Accumulator for AvgAccumulator { } #[derive(Debug)] -struct AvgAccumulatorV2 { - start_index: usize, +struct AvgRowAccumulator { + state_index: usize, sum_datatype: DataType, } -impl AvgAccumulatorV2 { +impl AvgRowAccumulator { pub fn new(start_index: usize, sum_datatype: DataType) -> Self { Self { - start_index, + state_index: start_index, sum_datatype, } } } -impl AccumulatorV2 for AvgAccumulatorV2 { +impl RowAccumulator for AvgRowAccumulator { fn update_batch( &mut self, values: &[ArrayRef], @@ -219,14 +225,14 @@ impl AccumulatorV2 for AvgAccumulatorV2 { let values = &values[0]; // count let delta = (values.len() - values.data().null_count()) as u64; - accessor.add_u64(self.start_index, delta); + accessor.add_u64(self.state_index(), delta); // sum sum::add_to_row( &self.sum_datatype, - self.start_index + 1, + self.state_index() + 1, accessor, - &sum::sum_batch(values)?, + &sum::sum_batch(values, &self.sum_datatype)?, )?; Ok(()) } @@ -239,30 +245,35 @@ impl AccumulatorV2 for AvgAccumulatorV2 { let counts = states[0].as_any().downcast_ref::().unwrap(); // count let delta = compute::sum(counts).unwrap_or(0); - accessor.add_u64(self.start_index, delta); + accessor.add_u64(self.state_index(), delta); // sum sum::add_to_row( &self.sum_datatype, - self.start_index + 1, + self.state_index() + 1, accessor, - &sum::sum_batch(&states[1])?, + &sum::sum_batch(&states[1], &self.sum_datatype)?, )?; Ok(()) } fn evaluate(&self, accessor: &RowAccessor) -> Result { assert_eq!(self.sum_datatype, DataType::Float64); - Ok(match accessor.get_u64_opt(self.start_index) { + Ok(match accessor.get_u64_opt(self.state_index()) { None => ScalarValue::Float64(None), Some(0) => ScalarValue::Float64(Some(0.0)), Some(n) => ScalarValue::Float64( accessor - .get_f64_opt(self.start_index + 1) + .get_f64_opt(self.state_index() + 1) .map(|f| f / n as f64), ), }) } + + #[inline(always)] + fn 
state_index(&self) -> usize { + self.state_index + } } #[cfg(test)] diff --git a/datafusion/physical-expr/src/aggregate/count.rs b/datafusion/physical-expr/src/aggregate/count.rs index 9ccd13d5753ed..54bec05d72f0a 100644 --- a/datafusion/physical-expr/src/aggregate/count.rs +++ b/datafusion/physical-expr/src/aggregate/count.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -use crate::aggregate::accumulator_v2::AccumulatorV2; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::{AggregateExpr, PhysicalExpr}; use arrow::compute; use arrow::datatypes::DataType; @@ -96,15 +96,15 @@ impl AggregateExpr for Count { &self.name } - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { true } - fn create_accumulator_v2( + fn create_row_accumulator( &self, start_index: usize, - ) -> Result> { - Ok(Box::new(CountAccumulatorV2::new(start_index))) + ) -> Result> { + Ok(Box::new(CountRowAccumulator::new(start_index))) } } @@ -146,17 +146,17 @@ impl Accumulator for CountAccumulator { } #[derive(Debug)] -struct CountAccumulatorV2 { - index: usize, +struct CountRowAccumulator { + state_index: usize, } -impl CountAccumulatorV2 { +impl CountRowAccumulator { pub fn new(index: usize) -> Self { - Self { index } + Self { state_index: index } } } -impl AccumulatorV2 for CountAccumulatorV2 { +impl RowAccumulator for CountRowAccumulator { fn update_batch( &mut self, values: &[ArrayRef], @@ -164,7 +164,7 @@ impl AccumulatorV2 for CountAccumulatorV2 { ) -> Result<()> { let array = &values[0]; let delta = (array.len() - array.data().null_count()) as u64; - accessor.add_u64(self.index, delta); + accessor.add_u64(self.state_index, delta); Ok(()) } @@ -176,13 +176,18 @@ impl AccumulatorV2 for CountAccumulatorV2 { let counts = states[0].as_any().downcast_ref::().unwrap(); let delta = &compute::sum(counts); if let Some(d) = delta { - accessor.add_u64(self.index, *d); + accessor.add_u64(self.state_index, *d); } 
Ok(()) } fn evaluate(&self, accessor: &RowAccessor) -> Result { - Ok(accessor.get_as_scalar(&DataType::UInt64, self.index)) + Ok(accessor.get_as_scalar(&DataType::UInt64, self.state_index)) + } + + #[inline(always)] + fn state_index(&self) -> usize { + self.state_index } } diff --git a/datafusion/physical-expr/src/aggregate/min_max.rs b/datafusion/physical-expr/src/aggregate/min_max.rs index 2062d16c71245..dd2f44b22c075 100644 --- a/datafusion/physical-expr/src/aggregate/min_max.rs +++ b/datafusion/physical-expr/src/aggregate/min_max.rs @@ -37,7 +37,7 @@ use datafusion_common::ScalarValue; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::Accumulator; -use crate::aggregate::accumulator_v2::AccumulatorV2; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; @@ -114,7 +114,7 @@ impl AggregateExpr for Max { &self.name } - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -130,11 +130,11 @@ impl AggregateExpr for Max { ) } - fn create_accumulator_v2( + fn create_row_accumulator( &self, start_index: usize, - ) -> Result> { - Ok(Box::new(MaxAccumulatorV2::new( + ) -> Result> { + Ok(Box::new(MaxRowAccumulator::new( start_index, self.data_type.clone(), ))) @@ -547,18 +547,18 @@ impl Accumulator for MaxAccumulator { } #[derive(Debug)] -struct MaxAccumulatorV2 { +struct MaxRowAccumulator { index: usize, data_type: DataType, } -impl MaxAccumulatorV2 { +impl MaxRowAccumulator { pub fn new(index: usize, data_type: DataType) -> Self { Self { index, data_type } } } -impl AccumulatorV2 for MaxAccumulatorV2 { +impl RowAccumulator for MaxRowAccumulator { fn update_batch( &mut self, values: &[ArrayRef], @@ -581,6 +581,11 @@ impl AccumulatorV2 for MaxAccumulatorV2 { fn evaluate(&self, accessor: &RowAccessor) -> Result { Ok(accessor.get_as_scalar(&self.data_type, self.index)) } 
+ + #[inline(always)] + fn state_index(&self) -> usize { + self.index + } } /// MIN aggregate expression @@ -642,7 +647,7 @@ impl AggregateExpr for Min { &self.name } - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -658,11 +663,11 @@ impl AggregateExpr for Min { ) } - fn create_accumulator_v2( + fn create_row_accumulator( &self, start_index: usize, - ) -> Result> { - Ok(Box::new(MinAccumulatorV2::new( + ) -> Result> { + Ok(Box::new(MinRowAccumulator::new( start_index, self.data_type.clone(), ))) @@ -706,18 +711,18 @@ impl Accumulator for MinAccumulator { } #[derive(Debug)] -struct MinAccumulatorV2 { +struct MinRowAccumulator { index: usize, data_type: DataType, } -impl MinAccumulatorV2 { +impl MinRowAccumulator { pub fn new(index: usize, data_type: DataType) -> Self { Self { index, data_type } } } -impl AccumulatorV2 for MinAccumulatorV2 { +impl RowAccumulator for MinRowAccumulator { fn update_batch( &mut self, values: &[ArrayRef], @@ -740,6 +745,11 @@ impl AccumulatorV2 for MinAccumulatorV2 { fn evaluate(&self, accessor: &RowAccessor) -> Result { Ok(accessor.get_as_scalar(&self.data_type, self.index)) } + + #[inline(always)] + fn state_index(&self) -> usize { + self.index + } } #[cfg(test)] diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index b9ad768e8eca6..09e8a9b0ac89f 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::aggregate::accumulator_v2::AccumulatorV2; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::PhysicalExpr; use arrow::datatypes::Field; use datafusion_common::{Result, ScalarValue}; @@ -38,9 +38,9 @@ pub(crate) mod count_distinct; pub(crate) mod covariance; #[macro_use] pub(crate) mod min_max; -pub mod accumulator_v2; pub mod build_in; mod hyperloglog; +pub mod row_accumulator; pub(crate) mod stats; pub(crate) mod stddev; pub(crate) mod sum; @@ -81,14 +81,19 @@ pub trait AggregateExpr: Send + Sync + Debug { } /// If the aggregate expression is supported by row format - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { false } - fn create_accumulator_v2( + /// RowAccumulator to access/update row-based aggregation state in-place. + /// Currently, row accumulator only supports states of fixed-sized type. + /// + /// We recommend implementing `RowAccumulator` along with the standard `Accumulator`, + /// when its state is of fixed size, as RowAccumulator is more memory efficient and CPU-friendly. + fn create_row_accumulator( &self, _start_index: usize, - ) -> Result> { + ) -> Result> { unreachable!() } } diff --git a/datafusion/physical-expr/src/aggregate/accumulator_v2.rs b/datafusion/physical-expr/src/aggregate/row_accumulator.rs similarity index 54% rename from datafusion/physical-expr/src/aggregate/accumulator_v2.rs rename to datafusion/physical-expr/src/aggregate/row_accumulator.rs index d2ebf12f710c8..386787454f853 100644 --- a/datafusion/physical-expr/src/aggregate/accumulator_v2.rs +++ b/datafusion/physical-expr/src/aggregate/row_accumulator.rs @@ -22,7 +22,27 @@ use datafusion_common::{Result, ScalarValue}; use datafusion_row::accessor::RowAccessor; use std::fmt::Debug; -pub trait AccumulatorV2: Send + Sync + Debug { +/// Row-based accumulator where the internal aggregate state(s) are stored using row format. 
+/// +/// Unlike the [`datafusion_expr::Accumulator`], the [`RowAccumulator`] does not store the state internally. +/// Instead, it knows how to access/update the state stored in a row via the provided accessor and +/// its state's starting field index in the row. +/// +/// For example, when evaluating `SELECT a, sum(b), avg(c), count(d) from t GROUP BY a;`, we would have one row used as +/// aggregation state for each distinct `a` value, the index of the first and the only state of `sum(b)` would be 0, +/// the index of the first state of `avg(c)` would be 1, and the index of the first and only state of `count(d)` would be 3: +/// +/// sum(b) state_index = 0 count(d) state_index = 3 +/// | | +/// v v +/// +--------+----------+--------+----------+ +/// | sum(b) | count(c) | sum(c) | count(d) | +/// +--------+----------+--------+----------+ +/// ^ +/// | +/// avg(c) state_index = 1 +/// +pub trait RowAccumulator: Send + Sync + Debug { /// updates the accumulator's state from a vector of arrays. fn update_batch( &mut self, @@ -39,4 +39,7 @@ /// returns its value based on its current state. fn evaluate(&self, accessor: &RowAccessor) -> Result; + + /// State's starting field index in the row.
+ fn state_index(&self) -> usize; } diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index b1928c7031531..c369e7af00813 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -34,10 +34,11 @@ use arrow::{ use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::Accumulator; -use crate::aggregate::accumulator_v2::AccumulatorV2; +use crate::aggregate::row_accumulator::RowAccumulator; use crate::expressions::format_state_name; use arrow::array::Array; use arrow::array::DecimalArray; +use arrow::compute::cast; use datafusion_row::accessor::RowAccessor; /// SUM aggregate expression @@ -99,7 +100,7 @@ impl AggregateExpr for Sum { &self.name } - fn accumulator_v2_supported(&self) -> bool { + fn row_accumulator_supported(&self) -> bool { matches!( self.data_type, DataType::UInt8 @@ -115,11 +116,11 @@ impl AggregateExpr for Sum { ) } - fn create_accumulator_v2( + fn create_row_accumulator( &self, start_index: usize, - ) -> Result> { - Ok(Box::new(SumAccumulatorV2::new( + ) -> Result> { + Ok(Box::new(SumRowAccumulator::new( start_index, self.data_type.clone(), ))) @@ -172,7 +173,8 @@ fn sum_decimal_batch( } // sums the array and returns a ScalarValue of its corresponding type. -pub(crate) fn sum_batch(values: &ArrayRef) -> Result { +pub(crate) fn sum_batch(values: &ArrayRef, sum_type: &DataType) -> Result { + let values = &cast(values, sum_type)?; Ok(match values.data_type() { DataType::Decimal(precision, scale) => { sum_decimal_batch(values, precision, scale)? 
@@ -439,7 +441,7 @@ impl Accumulator for SumAccumulator { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let values = &values[0]; - self.sum = sum(&self.sum, &sum_batch(values)?)?; + self.sum = sum(&self.sum, &sum_batch(values, &self.sum.get_datatype())?)?; Ok(()) } @@ -456,25 +458,30 @@ impl Accumulator for SumAccumulator { } #[derive(Debug)] -struct SumAccumulatorV2 { +struct SumRowAccumulator { index: usize, datatype: DataType, } -impl SumAccumulatorV2 { +impl SumRowAccumulator { pub fn new(index: usize, datatype: DataType) -> Self { Self { index, datatype } } } -impl AccumulatorV2 for SumAccumulatorV2 { +impl RowAccumulator for SumRowAccumulator { fn update_batch( &mut self, values: &[ArrayRef], accessor: &mut RowAccessor, ) -> Result<()> { let values = &values[0]; - add_to_row(&self.datatype, self.index, accessor, &sum_batch(values)?)?; + add_to_row( + &self.datatype, + self.index, + accessor, + &sum_batch(values, &self.datatype)?, + )?; Ok(()) } @@ -489,6 +496,11 @@ impl AccumulatorV2 for SumAccumulatorV2 { fn evaluate(&self, accessor: &RowAccessor) -> Result { Ok(accessor.get_as_scalar(&self.datatype, self.index)) } + + #[inline(always)] + fn state_index(&self) -> usize { + self.index + } } #[cfg(test)] @@ -532,7 +544,7 @@ mod tests { .collect::() .with_precision_and_scale(10, 0)?, ); - let result = sum_batch(&array)?; + let result = sum_batch(&array, &DataType::Decimal(10, 0))?; assert_eq!(ScalarValue::Decimal128(Some(15), 10, 0), result); // test agg @@ -567,7 +579,7 @@ mod tests { .collect::() .with_precision_and_scale(10, 0)?, ); - let result = sum_batch(&array)?; + let result = sum_batch(&array, &DataType::Decimal(10, 0))?; assert_eq!(ScalarValue::Decimal128(Some(13), 10, 0), result); // test agg @@ -601,7 +613,7 @@ mod tests { .collect::() .with_precision_and_scale(10, 0)?, ); - let result = sum_batch(&array)?; + let result = sum_batch(&array, &DataType::Decimal(10, 0))?; assert_eq!(ScalarValue::Decimal128(None, 10, 0), result); // 
test agg diff --git a/datafusion/row/src/accessor.rs b/datafusion/row/src/accessor.rs index ae5c74b701352..b6ec41d3345bb 100644 --- a/datafusion/row/src/accessor.rs +++ b/datafusion/row/src/accessor.rs @@ -23,13 +23,14 @@ use crate::{fn_get_idx, fn_get_idx_opt, fn_set_idx}; use arrow::datatypes::{DataType, Schema}; use arrow::util::bit_util::{get_bit_raw, set_bit_raw}; use datafusion_common::ScalarValue; +use std::sync::Arc; //TODO: DRY with reader and writer /// Read the tuple `data[base_offset..]` we are currently pointing to pub struct RowAccessor<'a> { /// Layout on how to read each field - layout: RowLayout, + layout: Arc, /// Raw bytes slice where the tuple stores data: &'a mut [u8], /// Start position for the current tuple in the raw bytes slice. @@ -103,13 +104,13 @@ impl<'a> RowAccessor<'a> { /// new pub fn new(schema: &Schema, row_type: RowType) -> Self { Self { - layout: RowLayout::new(schema, row_type), + layout: Arc::new(RowLayout::new(schema, row_type)), data: &mut [], base_offset: 0, } } - pub fn new_from_layout(layout: RowLayout) -> Self { + pub fn new_from_layout(layout: Arc) -> Self { Self { layout, data: &mut [], From 7350cebee1d23c6485f0224db61752c68136ad57 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 6 May 2022 19:24:35 +0800 Subject: [PATCH 8/8] Apply suggestions from code review Co-authored-by: Andrew Lamb --- datafusion/core/src/physical_plan/aggregates/row_hash.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index eac1590dbb110..e364048e75fda 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -395,9 +395,9 @@ impl std::fmt::Debug for AggregationState { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { // hashes are not store inline, so could only get values let map_string = "RawTable"; - 
f.debug_struct("RowAccumulators") + f.debug_struct("AggregationState") .field("map", &map_string) - .field("row_group_states", &self.group_states) + .field("group_states", &self.group_states) .finish() } }