apache · taiyang-li · Apr 3, 2025 · Jan 10, 2025
diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala
@@ -124,6 +124,7 @@ object CHRuleApi {
     injector.injectPostTransform(c => AddPreProjectionForHashJoin.apply(c.session))
     injector.injectPostTransform(c => ReplaceSubStringComparison.apply(c.session))
     injector.injectPostTransform(c => EliminateDeduplicateAggregateWithAnyJoin(c.session))
+    injector.injectPostTransform(c => FlattenNestedExpressions.apply(c.session))
 
     // Gluten columnar: Fallback policies.
     injector.injectFallbackPolicy(c => p => ExpandFallbackPolicy(c.caller.isAqe(), p))

diff --git a/...ickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala b/...ickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala
@@ -581,7 +581,9 @@ class CHSparkPlanExecApi extends SparkPlanExecApi with Logging {
     List(
       Sig[CollectList](ExpressionNames.COLLECT_LIST),
       Sig[CollectSet](ExpressionNames.COLLECT_SET),
-      Sig[MonotonicallyIncreasingID](MONOTONICALLY_INCREASING_ID)
+      Sig[MonotonicallyIncreasingID](MONOTONICALLY_INCREASING_ID),
+      CHFlattenedExpression.sigAnd,
+      CHFlattenedExpression.sigOr
     ) ++
       ExpressionExtensionTrait.expressionExtensionSigList ++
       SparkShimLoader.getSparkShims.bloomFilterExpressionMappings()
@@ -947,4 +949,19 @@ class CHSparkPlanExecApi extends SparkPlanExecApi with Logging {
       outputAttributes: Seq[Attribute],
       child: Seq[SparkPlan]): ColumnarRangeBaseExec =
     CHRangeExecTransformer(start, end, step, numSlices, numElements, outputAttributes, child)
+
+  /**
+   * Tells whether `expr` is a flattened `and`/`or` whose function name is accepted by
+   * `CHFlattenedExpression.supported`. Every other expression is reported as unsupported.
+   */
+  override def expressionFlattenSupported(expr: Expression): Boolean = {
+    // Only the two flattened case classes carry a function name to look up.
+    val flattenedName = expr match {
+      case conjunction: FlattenedAnd => Some(conjunction.name)
+      case disjunction: FlattenedOr => Some(disjunction.name)
+      case _ => None
+    }
+    flattenedName.exists(n => CHFlattenedExpression.supported(n))
+  }
+
+  /**
+   * Builds the transformer for a flattened expression. `FlattenedAnd` and `FlattenedOr` are
+   * wrapped in a `GenericExpressionTransformer` keyed by the expression's own `name`; anything
+   * else is delegated to the parent implementation.
+   */
+  override def genFlattenedExpressionTransformer(
+      substraitName: String,
+      children: Seq[ExpressionTransformer],
+      expr: Expression): ExpressionTransformer = {
+    def generic(functionName: String, original: Expression): ExpressionTransformer =
+      GenericExpressionTransformer(functionName, children, original)
+
+    expr match {
+      case conjunction: FlattenedAnd => generic(conjunction.name, conjunction)
+      case disjunction: FlattenedOr => generic(disjunction.name, disjunction)
+      case _ => super.genFlattenedExpressionTransformer(substraitName, children, expr)
+    }
+  }
 }
diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHFilterExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHFilterExecTransformer.scala
@@ -16,7 +16,9 @@
  */
 package org.apache.gluten.execution
 
-import org.apache.spark.sql.catalyst.expressions.{And, Expression}
+import org.apache.gluten.expression.CHFlattenedExpression
+
+import org.apache.spark.sql.catalyst.expressions.{And, Expression, ExprId, IsNotNull}
 import org.apache.spark.sql.execution.SparkPlan
 
 case class CHFilterExecTransformer(condition: Expression, child: SparkPlan)
@@ -48,4 +50,13 @@ case class FilterExecTransformer(condition: Expression, child: SparkPlan)
   override protected def getRemainingCondition: Expression = condition
   override protected def withNewChildInternal(newChild: SparkPlan): FilterExecTransformer =
     copy(child = newChild)
+  /**
+   * Expression ids of the attributes referenced by this filter's not-null predicates.
+   *
+   * When the condition is a flattened expression, the predicates are its direct children of the
+   * form `IsNotNull(a)` where `a` passes `isNullIntolerant` and references only attributes of
+   * the child's output. For any other condition, the `notNullPreds` member defined outside this
+   * block is used as-is.
+   */
+  override protected val notNullAttributes: Seq[ExprId] = {
+    val candidatePreds = condition match {
+      case flattened: CHFlattenedExpression =>
+        flattened.children.filter {
+          case IsNotNull(a) => isNullIntolerant(a) && a.references.subsetOf(child.outputSet)
+          case _ => false
+        }
+      case _ => notNullPreds
+    }
+    candidatePreds.flatMap(_.references).distinct.map(_.exprId)
+  }
 }
diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/expression/CHFlattenedExpression.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/expression/CHFlattenedExpression.scala
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.expression
+
+import org.apache.gluten.config.GlutenConfig
+import org.apache.gluten.exception.GlutenException
+import org.apache.gluten.expression.CHFlattenedExpression.genFlattenedExpression
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
+import org.apache.spark.sql.types.DataType
+
+/**
+ * Base class of the flattened (n-ary) expressions of the ClickHouse backend.
+ *
+ * @param children
+ *   operands of the flattened call
+ * @param name
+ *   function name; it selects the concrete subclass when the node is rebuilt with new children
+ */
+abstract class CHFlattenedExpression(children: Seq[Expression], name: String) extends Expression {
+
+  def this() = this(Seq.empty[Expression], "")
+
+  override def toString: String = children.mkString(s"$name(", ", ", ")")
+
+  // No row-based evaluation is implemented here: the result is always null.
+  override def eval(input: InternalRow): Any = null
+
+  // No code generation is implemented here: the result is always null.
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = null
+
+  /**
+   * Rebuilds this node through the companion factory, keeping data type, name and nullability.
+   * Throws a GlutenException when the factory does not know `name`.
+   */
+  override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression =
+    genFlattenedExpression(dataType, newChildren, name, nullable).getOrElse {
+      throw new GlutenException(
+        String.format(
+          "Logical error, the function %s should not be a clickhouse Flattened function.",
+          name))
+    }
+}
+
+/** Flattened form of nested `and` calls: all operands are direct children of one node. */
+case class FlattenedAnd(
+    dataType: DataType,
+    children: Seq[Expression],
+    name: String,
+    nullable: Boolean)
+  extends CHFlattenedExpression(children, name)
+
+/** Flattened form of nested `or` calls: all operands are direct children of one node. */
+case class FlattenedOr(
+    dataType: DataType,
+    children: Seq[Expression],
+    name: String,
+    nullable: Boolean)
+  extends CHFlattenedExpression(children, name)
+
+/** Factory and lookup helpers for the flattened expressions. */
+object CHFlattenedExpression {
+
+  /** Signature that registers [[FlattenedAnd]] under the name "FlattenedAnd". */
+  def sigAnd: Sig = Sig[FlattenedAnd]("FlattenedAnd")
+
+  /** Signature that registers [[FlattenedOr]] under the name "FlattenedOr". */
+  def sigOr: Sig = Sig[FlattenedOr]("FlattenedOr")
+
+  /**
+   * Checks whether `name` is listed in the comma-separated configuration value returned by
+   * `GlutenConfig.get.getSupportedFlattenedExpressions`.
+   *
+   * Entries are trimmed before the comparison so that a list written as "and, or" still
+   * matches "or".
+   */
+  def supported(name: String): Boolean =
+    GlutenConfig.get.getSupportedFlattenedExpressions
+      .split(",")
+      .exists(entry => entry.trim.equals(name))
+
+  /**
+   * Creates the flattened expression for `name`.
+   *
+   * @return
+   *   a [[FlattenedAnd]] for "and", a [[FlattenedOr]] for "or", and `None` for any other name
+   */
+  def genFlattenedExpression(
+      dataType: DataType,
+      children: Seq[Expression],
+      name: String,
+      nullable: Boolean): Option[CHFlattenedExpression] = name match {
+    case "and" => Some(FlattenedAnd(dataType, children, name, nullable))
+    case "or" => Some(FlattenedOr(dataType, children, name, nullable))
+    case _ => None
+  }
+
+}
diff --git a/...ends-clickhouse/src/main/scala/org/apache/gluten/extension/FlattenNestedExpressions.scala b/...ends-clickhouse/src/main/scala/org/apache/gluten/extension/FlattenNestedExpressions.scala
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.extension
+
+import org.apache.gluten.execution.{FilterExecTransformer, ProjectExecTransformer}
+import org.apache.gluten.expression.{CHFlattenedExpression, ExpressionMappings}
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.types.DataType
+
+/**
+ * Flatten nested expressions for optimization, to reduce expression calls. Currently supports
+ * `and` and `or`. e.g. select ... and(and(a=1, b=2), c=3) => select ... and(a=1, b=2, c=3).
+ *
+ * Operands keep their original left-to-right order in the flattened call. Only the expressions
+ * of `ProjectExecTransformer` and `FilterExecTransformer` nodes are rewritten; all other plan
+ * nodes are only traversed.
+ */
+case class FlattenNestedExpressions(spark: SparkSession) extends Rule[SparkPlan] {
+
+  /** Rewrites the plan only when some project/filter expression can be flattened. */
+  override def apply(plan: SparkPlan): SparkPlan =
+    if (canBeOptimized(plan)) visitPlan(plan) else plan
+
+  /** True when the plan rooted at `plan` holds an expression that can be flattened. */
+  private def canBeOptimized(plan: SparkPlan): Boolean = plan match {
+    case p: ProjectExecTransformer =>
+      // A project item that is itself an And/Or disables the rewrite for this subtree.
+      val hasBareLogical = p.projectList.exists {
+        case _: And | _: Or => true
+        case _ => false
+      }
+      !hasBareLogical &&
+      (p.projectList.exists(e => canBeOptimized(e)) || p.children.exists(c => canBeOptimized(c)))
+    case f: FilterExecTransformer =>
+      canBeOptimized(f.condition) || canBeOptimized(f.child)
+    case _ => plan.children.exists(c => canBeOptimized(c))
+  }
+
+  /**
+   * True when `expr` (looking through an outer Alias) is an and/or whose function name is
+   * enabled by `CHFlattenedExpression.supported`, or when any descendant of a non-and/or
+   * expression satisfies that condition.
+   */
+  private def canBeOptimized(expr: Expression): Boolean = {
+    val target = expr match {
+      case a: Alias => a.child
+      case other => other
+    }
+    getExpressionName(target) match {
+      case Some(functionName) => CHFlattenedExpression.supported(functionName)
+      case None =>
+        target match {
+          case _: LeafExpression => false
+          case _ => target.children.exists(c => canBeOptimized(c))
+        }
+    }
+  }
+
+  /** Function name mapped to `And`/`Or` in `ExpressionMappings`; None for other expressions. */
+  private def getExpressionName(expr: Expression): Option[String] = expr match {
+    case _: And => ExpressionMappings.expressionsMap.get(classOf[And])
+    case _: Or => ExpressionMappings.expressionsMap.get(classOf[Or])
+    case _ => Option.empty[String]
+  }
+
+  /** Rebuilds the plan top-down, optimizing project aliases and filter conditions. */
+  private def visitPlan(plan: SparkPlan): SparkPlan = plan match {
+    case project: ProjectExecTransformer =>
+      val newProjectList: Seq[NamedExpression] = project.projectList.map {
+        case a: Alias =>
+          // Keep every secondary Alias parameter; rebuilding with only the exprId would drop
+          // the qualifier and the metadata of the output column.
+          a.copy(child = optimize(a.child))(
+            exprId = a.exprId,
+            qualifier = a.qualifier,
+            explicitMetadata = a.explicitMetadata,
+            nonInheritableMetadataKeys = a.nonInheritableMetadataKeys)
+        case other => other
+      }
+      ProjectExecTransformer(newProjectList, visitPlan(project.child))
+    case filter: FilterExecTransformer =>
+      val newCondition = optimize(filter.condition)
+      FilterExecTransformer(newCondition, visitPlan(filter.child))
+    case _ =>
+      plan.withNewChildren(plan.children.map(visitPlan))
+  }
+
+  /**
+   * Returns `expr` with every flattenable and/or chain replaced by a flattened expression.
+   * Expressions that are not a supported and/or are rebuilt from their optimized children.
+   */
+  private def optimize(expr: Expression): Expression = expr match {
+    case _: And | _: Or if canBeOptimized(expr) => flatten(expr)
+    case _ => expr.withNewChildren(expr.children.map(optimize))
+  }
+
+  /**
+   * Flattens the chain of same-kind and/or calls rooted at `root`.
+   *
+   * The flattened expression is produced when more than one call was merged, or when one of
+   * the collected operands already contains a flattened expression (otherwise the optimized
+   * operands would be lost). In every other case `root` is returned unchanged.
+   */
+  private def flatten(root: Expression): Expression = {
+    // A node is merged into the root call only when it is the same operator as the root.
+    def mergeable(e: Expression): Boolean = (root, e) match {
+      case (_: And, a: And) => canBeOptimized(a)
+      case (_: Or, o: Or) => canBeOptimized(o)
+      case _ => false
+    }
+
+    // Returns the operands in source (left-to-right) order and the number of merged calls.
+    def collectOperands(e: Expression): (Seq[Expression], Int) = e match {
+      case b: BinaryOperator if mergeable(b) =>
+        val (leftOperands, leftMerged) = collectOperands(b.left)
+        val (rightOperands, rightMerged) = collectOperands(b.right)
+        (leftOperands ++ rightOperands, leftMerged + rightMerged + 1)
+      case other => (Seq(optimize(other)), 0)
+    }
+
+    val (operands, mergedCalls) = collectOperands(root)
+    val functionName = getExpressionName(root)
+    if ((mergedCalls > 1 && functionName.isDefined) || flattenedExpressionExists(operands)) {
+      CHFlattenedExpression
+        .genFlattenedExpression(root.dataType, operands, functionName.getOrElse(""), root.nullable)
+        .getOrElse(root)
+    } else {
+      root
+    }
+  }
+
+  /** True when any of `children`, or any of their descendants, is a flattened expression. */
+  private def flattenedExpressionExists(children: Seq[Expression]): Boolean =
+    children.exists {
+      case _: CHFlattenedExpression => true
+      case c => flattenedExpressionExists(c.children)
+    }
+}
diff --git a/...s-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/...s-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
@@ -17,6 +17,7 @@
 package org.apache.gluten.execution
 
 import org.apache.gluten.backendsapi.clickhouse.CHConfig
+import org.apache.gluten.expression.{FlattenedAnd, FlattenedOr}
 
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{DataFrame, GlutenTestUtils, Row}
@@ -381,6 +382,45 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS
     }
   }
 
+  test("GLUTEN-8557: Optimize nested and/or") {
+    // True when `expr` or one of its descendants is a flattened and/or named `functionName`
+    // with exactly `argNum` children.
+    def containsFlattened(expr: Expression, functionName: String, argNum: Int): Boolean = {
+      val matchesHere = expr match {
+        case s: FlattenedAnd => s.name.equals(functionName) && s.children.size == argNum
+        case o: FlattenedOr => o.name.equals(functionName) && o.children.size == argNum
+        case _ => false
+      }
+      matchesHere || expr.children.exists(c => containsFlattened(c, functionName, argNum))
+    }
+
+    // Inspects the condition of the first filter found on each path from the plan root.
+    def checkFlattenedFunctions(plan: SparkPlan, functionName: String, argNum: Int): Boolean =
+      plan match {
+        case f: FilterExecTransformer => containsFlattened(f.condition, functionName, argNum)
+        case _ => plan.children.exists(c => checkFlattenedFunctions(c, functionName, argNum))
+      }
+
+    runQueryAndCompare(
+      "SELECT count(1) from json_test where int_field1 = 5 and double_field1 > 1.0" +
+        " and string_field1 is not null") {
+      df => assert(checkFlattenedFunctions(df.queryExecution.executedPlan, "and", 5))
+    }
+    runQueryAndCompare(
+      "SELECT count(1) from json_test where int_field1 = 5 or double_field1 > 1.0" +
+        " or string_field1 is not null") {
+      df => assert(checkFlattenedFunctions(df.queryExecution.executedPlan, "or", 3))
+    }
+    runQueryAndCompare(
+      "SELECT count(1) from json_test where int_field1 = 5 and double_field1 > 1.0" +
+        " and double_field1 < 10 or int_field1 = 12 or string_field1 is not null") {
+      df =>
+        val executed = df.queryExecution.executedPlan
+        assert(
+          checkFlattenedFunctions(executed, "and", 3) &&
+            checkFlattenedFunctions(executed, "or", 3))
+    }
+  }
+
   test("Test covar_samp") {
     runQueryAndCompare("SELECT covar_samp(double_field1, int_field1) from json_test") { _ => }
   }