From f00997267b10049a1560d657dc235c6436a2345d Mon Sep 17 00:00:00 2001 From: zml1206 Date: Fri, 18 Apr 2025 17:12:46 +0800 Subject: [PATCH 1/2] [VL] Change isDistinct of distinct aggregateExpression to false --- .../execution/HashAggregateExecBaseTransformer.scala | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/execution/HashAggregateExecBaseTransformer.scala b/gluten-substrait/src/main/scala/org/apache/gluten/execution/HashAggregateExecBaseTransformer.scala index a1572073fe9e..61a564ccb78f 100644 --- a/gluten-substrait/src/main/scala/org/apache/gluten/execution/HashAggregateExecBaseTransformer.scala +++ b/gluten-substrait/src/main/scala/org/apache/gluten/execution/HashAggregateExecBaseTransformer.scala @@ -193,13 +193,23 @@ object HashAggregateExecBaseTransformer { .genHashAggregateExecTransformer( agg.requiredChildDistributionExpressions, agg.groupingExpressions, - agg.aggregateExpressions, + agg.aggregateExpressions.map(rewriteAggregateExpression), agg.aggregateAttributes, getInitialInputBufferOffset(agg), agg.resultExpressions, agg.child ) } + + // Vanilla spark will add an aggregate to remove duplicates for the distinct aggregation + // function, so velox does not need to process distinct. + def rewriteAggregateExpression(aggregateExpr: AggregateExpression): AggregateExpression = { + if (aggregateExpr.isDistinct) { + aggregateExpr.copy(isDistinct = false) + } else { + aggregateExpr + } + } } trait HashAggregateExecPullOutBaseHelper { From d2b160739eda74bdb17b209fde015df32a392189 Mon Sep 17 00:00:00 2001 From: zml1206 Date: Fri, 18 Apr 2025 17:55:26 +0800 Subject: [PATCH 2/2] update golden files --- .../resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt | 4 ++-- .../resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt | 4 ++-- .../resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt | 4 ++-- .../test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt | 4 ++-- .../test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt | 4 ++-- .../test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt | 4 ++-- .../test/resources/tpch-approved-plan/v1-ras/spark32/16.txt | 4 ++-- .../test/resources/tpch-approved-plan/v1-ras/spark33/16.txt | 4 ++-- .../test/resources/tpch-approved-plan/v1-ras/spark34/16.txt | 4 ++-- .../src/test/resources/tpch-approved-plan/v1/spark32/16.txt | 4 ++-- .../src/test/resources/tpch-approved-plan/v1/spark33/16.txt | 4 ++-- .../src/test/resources/tpch-approved-plan/v1/spark34/16.txt | 4 ++-- 12 files changed, 24 insertions(+), 24 deletions(-) diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt index 10972d6c2f1a..c7580b098b8d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt @@ -145,7 +145,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -178,7 +178,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt index 8cc2b417eb3f..fe95eff2478b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt @@ -145,7 +145,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -178,7 +178,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt index 3a9b17afd60c..f9c76d63b002 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt @@ -146,7 +146,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -179,7 +179,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt index 10972d6c2f1a..c7580b098b8d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt @@ -145,7 +145,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -178,7 +178,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt index 8cc2b417eb3f..fe95eff2478b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt @@ -145,7 +145,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -178,7 +178,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt index 3a9b17afd60c..f9c76d63b002 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt @@ -146,7 +146,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (21) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -179,7 +179,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (29) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt index 2c41822beeeb..b5bd598c9558 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt @@ -189,7 +189,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -222,7 +222,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt index cac61db4a6bd..5f9edf5ef2b6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt @@ -189,7 +189,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -222,7 +222,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt index 3441216f140f..281a2cda5fcb 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt @@ -190,7 +190,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -223,7 +223,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt index 2c41822beeeb..b5bd598c9558 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt @@ -189,7 +189,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -222,7 +222,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt index cac61db4a6bd..5f9edf5ef2b6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt @@ -189,7 +189,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -222,7 +222,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt index 3441216f140f..281a2cda5fcb 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt @@ -190,7 +190,7 @@ Results [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] (30) FlushableHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [partial_count(distinct ps_suppkey#X)] +Functions [1]: [partial_count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -223,7 +223,7 @@ Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] (38) RegularHashAggregateExecTransformer Input [4]: [p_brand#X, p_type#X, p_size#X, count#X] Keys [3]: [p_brand#X, p_type#X, p_size#X] -Functions [1]: [count(distinct ps_suppkey#X)] +Functions [1]: [count(ps_suppkey#X)] Aggregate Attributes [1]: [count(ps_suppkey#X)#X] Results [4]: [p_brand#X, p_type#X, p_size#X, count(ps_suppkey#X)#X AS supplier_cnt#X]