From 371741d50538cc890f8c47beea664ce7ef131f1e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 23 Apr 2026 17:00:02 -0600 Subject: [PATCH 01/14] add methods to trait --- .../comet/serde/CometExpressionSerde.scala | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala index 20c0343037..d4746e28a2 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala @@ -37,6 +37,24 @@ trait CometExpressionSerde[T <: Expression] { */ def getExprConfigName(expr: T): String = expr.getClass.getSimpleName + /** + * Get documentation for usages where this expression may be incompatible with Spark. This + * is called from GenerateDocs when generating the Compatibility Guide. + * + * @return + * List of reasons, defaulting to an empty list. + */ + def getIncompatibleReasons(): Seq[String] = Seq.empty + + /** + * Get documentation for usages where this expression is unsupported with Spark. This + * is called from GenerateDocs when generating the Compatibility Guide. + * + * @return + * List of reasons, defaulting to an empty list. + */ + def getUnsupportedReasons(): Seq[String] = Seq.empty + /** * Determine the support level of the expression based on its attributes. * From 9f0cdd5cb2139bf26d4b2cf97930b0094adc3204 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 23 Apr 2026 17:04:02 -0600 Subject: [PATCH 02/14] one example --- spark/src/main/scala/org/apache/comet/serde/arrays.scala | 2 -- spark/src/main/scala/org/apache/comet/serde/math.scala | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/arrays.scala b/spark/src/main/scala/org/apache/comet/serde/arrays.scala index 14d4536fc1..c73f6e20e9 100644 --- a/spark/src/main/scala/org/apache/comet/serde/arrays.scala +++ b/spark/src/main/scala/org/apache/comet/serde/arrays.scala @@ -57,8 +57,6 @@ object CometArrayRemove object CometArrayAppend extends CometExpressionSerde[ArrayAppend] { - override def getSupportLevel(expr: ArrayAppend): SupportLevel = Compatible() - override def convert( expr: ArrayAppend, inputs: Seq[Attribute], diff --git a/spark/src/main/scala/org/apache/comet/serde/math.scala b/spark/src/main/scala/org/apache/comet/serde/math.scala index a01d4cdf9d..f0fa1bd779 100644 --- a/spark/src/main/scala/org/apache/comet/serde/math.scala +++ b/spark/src/main/scala/org/apache/comet/serde/math.scala @@ -164,13 +164,17 @@ object CometUnhex extends CometExpressionSerde[Unhex] with MathExprBase { object CometAbs extends CometExpressionSerde[Abs] with MathExprBase { + val unsupportedReason = "Only integral, floating-point, and decimal types are supported" + + override def getUnsupportedReasons(): Seq[String] = Seq(unsupportedReason) + override def getSupportLevel(expr: Abs): SupportLevel = { expr.child.dataType match { case _: NumericType => Compatible() case _ => // Spark supports NumericType, DayTimeIntervalType, and YearMonthIntervalType - Unsupported(Some("Only integral, floating-point, and decimal types are supported")) + Unsupported(Some(unsupportedReason)) } } From 587adcf0daec035f49e620cf5de5143edf44a489 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 23 Apr 2026 17:14:39 -0600 Subject: [PATCH 03/14] more --- .../org/apache/comet/serde/CometExpressionSerde.scala | 8 ++++---- 
spark/src/main/scala/org/apache/comet/serde/arrays.scala | 2 ++ .../src/main/scala/org/apache/comet/serde/datetime.scala | 8 ++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala index d4746e28a2..58c6910475 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala @@ -38,8 +38,8 @@ trait CometExpressionSerde[T <: Expression] { def getExprConfigName(expr: T): String = expr.getClass.getSimpleName /** - * Get documentation for usages where this expression may be incompatible with Spark. This - * is called from GenerateDocs when generating the Compatibility Guide. + * Get documentation for usages where this expression may be incompatible with Spark. This is + * called from GenerateDocs when generating the Compatibility Guide. * * @return * List of reasons, defaulting to an empty list. @@ -47,8 +47,8 @@ trait CometExpressionSerde[T <: Expression] { def getIncompatibleReasons(): Seq[String] = Seq.empty /** - * Get documentation for usages where this expression is unsupported with Spark. This - * is called from GenerateDocs when generating the Compatibility Guide. + * Get documentation for usages where this expression is unsupported with Spark. This is called + * from GenerateDocs when generating the Compatibility Guide. * * @return * List of reasons, defaulting to an empty list. diff --git a/spark/src/main/scala/org/apache/comet/serde/arrays.scala b/spark/src/main/scala/org/apache/comet/serde/arrays.scala index c73f6e20e9..14d4536fc1 100644 --- a/spark/src/main/scala/org/apache/comet/serde/arrays.scala +++ b/spark/src/main/scala/org/apache/comet/serde/arrays.scala @@ -57,6 +57,8 @@ object CometArrayRemove object CometArrayAppend extends CometExpressionSerde[ArrayAppend] { + override def getSupportLevel(expr: ArrayAppend): SupportLevel = Compatible() + override def convert( expr: ArrayAppend, inputs: Seq[Attribute], diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 5413e8b439..cc6bc0fec0 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -536,6 +536,14 @@ object CometDateFormat extends CometExpressionSerde[DateFormatClass] { // ISO formats "yyyy-MM-dd'T'HH:mm:ss" -> "%Y-%m-%dT%H:%M:%S") + override def getIncompatibleReasons(): Seq[String] = Seq( + "Non-UTC timezones may produce different results than Spark" + ) + + override def getUnsupportedReasons(): Seq[String] = Seq( + s"Only the following formats are supported:${supportedFormats.mkString("\n- ", "\n-", "")}" + ) + override def getSupportLevel(expr: DateFormatClass): SupportLevel = { // Check timezone - only UTC is fully compatible val timezone = expr.timeZoneId.getOrElse("UTC") From 03c71569cbc79f58893658e28121ec9d1cd3cd7b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 23 Apr 2026 17:17:37 -0600 Subject: [PATCH 04/14] more --- .../main/scala/org/apache/comet/serde/datetime.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index cc6bc0fec0..1c0823759e 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ 
b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -180,12 +180,16 @@ object CometQuarter extends CometExpressionSerde[Quarter] with CometExprGetDateF object CometHour extends CometExpressionSerde[Hour] { + val incompatReason = "Incorrectly applies timezone conversion to TimestampNTZ inputs" + + " (https://github.com/apache/datafusion-comet/issues/3180)" + + override def getIncompatibleReasons(): Seq[String] = Seq( + incompatReason + ) + override def getSupportLevel(expr: Hour): SupportLevel = { if (expr.child.dataType.typeName == "timestamp_ntz") { - Incompatible( - Some( - "Incorrectly applies timezone conversion to TimestampNTZ inputs" + - " (https://github.com/apache/datafusion-comet/issues/3180)")) + Incompatible(Some(incompatReason)) } else { Compatible() } From 654391ad97252843e6fad9bc589f42bf19d62b83 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 23 Apr 2026 17:46:31 -0600 Subject: [PATCH 05/14] docs: generate per-category expression compatibility notes Wire GenerateDocs to emit incompatibility and unsupported notes into each expression compatibility page, driven by getIncompatibleReasons and getUnsupportedReasons on the serde traits. Add matching defaults to CometAggregateExpressionSerde so aggregate.md is covered too. Fix CometDateFormat.getUnsupportedReasons formatting. --- .../compatibility/expressions/aggregate.md | 3 + .../latest/compatibility/expressions/array.md | 3 + .../compatibility/expressions/datetime.md | 6 +- .../compatibility/expressions/struct.md | 3 + .../scala/org/apache/comet/GenerateDocs.scala | 67 ++++++++ .../serde/CometAggregateExpressionSerde.scala | 18 +++ .../apache/comet/serde/QueryPlanSerde.scala | 143 +++++++++--------- .../org/apache/comet/serde/datetime.scala | 14 +- 8 files changed, 177 insertions(+), 80 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility/expressions/aggregate.md b/docs/source/user-guide/latest/compatibility/expressions/aggregate.md index f017d899d2..e97593bf46 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/aggregate.md +++ b/docs/source/user-guide/latest/compatibility/expressions/aggregate.md @@ -19,6 +19,9 @@ under the License. # Aggregate Expressions + + + ## Incompatible Aggregates - **CollectSet**: Comet deduplicates NaN values (treats `NaN == NaN`) while Spark treats each NaN as a distinct value. diff --git a/docs/source/user-guide/latest/compatibility/expressions/array.md b/docs/source/user-guide/latest/compatibility/expressions/array.md index 71e911288c..fe10b43a05 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/array.md +++ b/docs/source/user-guide/latest/compatibility/expressions/array.md @@ -19,4 +19,7 @@ under the License. # Array Expressions + + + - **SortArray**: Nested arrays with `Struct` or `Null` child values are not supported natively and will fall back to Spark. diff --git a/docs/source/user-guide/latest/compatibility/expressions/datetime.md b/docs/source/user-guide/latest/compatibility/expressions/datetime.md index b18e6f723e..430d86576e 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/datetime.md +++ b/docs/source/user-guide/latest/compatibility/expressions/datetime.md @@ -19,9 +19,9 @@ under the License. # Date/Time Expressions -- **Hour, Minute, Second**: Incorrectly apply timezone conversion to TimestampNTZ inputs. TimestampNTZ stores local - time without timezone, so no conversion should be applied. These expressions work correctly with Timestamp inputs. 
- [#3180](https://github.com/apache/datafusion-comet/issues/3180) + + + - **TruncTimestamp (date_trunc)**: Produces incorrect results when used with non-UTC timezones. Compatible when timezone is UTC. [#2649](https://github.com/apache/datafusion-comet/issues/2649) diff --git a/docs/source/user-guide/latest/compatibility/expressions/struct.md b/docs/source/user-guide/latest/compatibility/expressions/struct.md index 2a207894cf..d940a8d52d 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/struct.md +++ b/docs/source/user-guide/latest/compatibility/expressions/struct.md @@ -19,5 +19,8 @@ under the License. # Struct Expressions + + + - **StructsToJson (to_json)**: Does not support `+Infinity` and `-Infinity` for numeric types (float, double). [#3016](https://github.com/apache/datafusion-comet/issues/3016) diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index ce3b42e78d..446466a7ba 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -39,10 +39,43 @@ object GenerateDocs { private val publicConfigs: Set[ConfigEntry[_]] = CometConf.allConfs.filter(_.isPublic).toSet + /** (expression class simple name, incompatible reasons, unsupported reasons) */ + private type CategoryNotes = Seq[(String, Seq[String], Seq[String])] + + /** + * Mapping from expression category to the compatibility guide page where that category's + * auto-generated notes should be written, along with a function that produces the notes for + * that category from the serde maps in `QueryPlanSerde`. + */ + private def categoryPages: Map[String, (String, () => CategoryNotes)] = Map( + "array" -> ("compatibility/expressions/array.md", + () => + QueryPlanSerde.arrayExpressions.toSeq.map { case (cls, serde) => + (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + }), + "datetime" -> ("compatibility/expressions/datetime.md", + () => + QueryPlanSerde.temporalExpressions.toSeq.map { case (cls, serde) => + (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + }), + "struct" -> ("compatibility/expressions/struct.md", + () => + QueryPlanSerde.structExpressions.toSeq.map { case (cls, serde) => + (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + }), + "aggregate" -> ("compatibility/expressions/aggregate.md", + () => + QueryPlanSerde.aggrSerdeMap.toSeq.map { case (cls, serde) => + (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + })) + def main(args: Array[String]): Unit = { val userGuideLocation = args(0) generateConfigReference(s"$userGuideLocation/configs.md") generateCompatibilityGuide(s"$userGuideLocation/compatibility/expressions/cast.md") + for ((category, (page, notesFn)) <- categoryPages) { + generateExpressionCompatNotes(s"$userGuideLocation/$page", category, notesFn()) + } } private def generateConfigReference(filename: String): Unit = { @@ -121,6 +154,40 @@ object GenerateDocs { w.close() } + private def generateExpressionCompatNotes( + filename: String, + category: String, + notes: CategoryNotes): Unit = { + val beginTag = s"" + val lines = readFile(filename) + val w = new BufferedOutputStream(new FileOutputStream(filename)) + for (line <- lines) { + w.write(s"${line.stripTrailing()}\n".getBytes) + if (line.trim == beginTag) { + writeExpressionCompatNotes(w, notes) + } + } + w.close() + } + + private def 
writeExpressionCompatNotes(w: BufferedOutputStream, notes: CategoryNotes): Unit = { + val sorted = notes.sortBy(_._1) + val incompat = sorted.flatMap { case (name, incompat, _) => incompat.map((name, _)) } + val unsupported = sorted.flatMap { case (name, _, unsupported) => unsupported.map((name, _)) } + if (incompat.nonEmpty) { + w.write("\n### Incompatible With Spark\n\n".getBytes) + for ((name, reason) <- incompat) { + w.write(s"- **$name**: $reason\n".getBytes) + } + } + if (unsupported.nonEmpty) { + w.write("\n### Unsupported Cases\n\n".getBytes) + for ((name, reason) <- unsupported) { + w.write(s"- **$name**: $reason\n".getBytes) + } + } + } + private def writeCastMatrixForMode(w: BufferedOutputStream, mode: CometEvalMode.Value): Unit = { val sortedTypes = CometCast.supportedTypes.sortBy(_.typeName) val typeNames = sortedTypes.map(_.typeName.replace("(10,2)", "")) diff --git a/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala index 0a5a2770b4..bc2f6fa793 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala @@ -39,6 +39,24 @@ trait CometAggregateExpressionSerde[T <: AggregateFunction] { */ def getExprConfigName(expr: T): String = expr.getClass.getSimpleName + /** + * Get documentation for usages where this expression may be incompatible with Spark. This is + * called from GenerateDocs when generating the Compatibility Guide. + * + * @return + * List of reasons, defaulting to an empty list. + */ + def getIncompatibleReasons(): Seq[String] = Seq.empty + + /** + * Get documentation for usages where this expression is unsupported with Spark. This is called + * from GenerateDocs when generating the Compatibility Guide. + * + * @return + * List of reasons, defaulting to an empty list. + */ + def getUnsupportedReasons(): Seq[String] = Seq.empty + /** * Determine the support level of the expression based on its attributes. 
* diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 768e4e0ed6..8114e54b4d 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -46,7 +46,7 @@ import org.apache.comet.shims.CometExprShim */ object QueryPlanSerde extends Logging with CometExprShim { - private val arrayExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + private[comet] val arrayExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[ArrayAppend] -> CometArrayAppend, classOf[ArrayCompact] -> CometArrayCompact, classOf[ArrayContains] -> CometArrayContains, @@ -88,7 +88,7 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[Not] -> CometNot, classOf[Or] -> CometOr) - private val mathExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + private[comet] val mathExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[Acos] -> CometScalarFunction("acos"), classOf[Add] -> CometAdd, classOf[Asin] -> CometScalarFunction("asin"), @@ -136,13 +136,14 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[MapContainsKey] -> CometMapContainsKey, classOf[MapFromEntries] -> CometMapFromEntries) - private val structExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( - classOf[CreateNamedStruct] -> CometCreateNamedStruct, - classOf[GetArrayStructFields] -> CometGetArrayStructFields, - classOf[GetStructField] -> CometGetStructField, - classOf[JsonToStructs] -> CometJsonToStructs, - classOf[StructsToJson] -> CometStructsToJson, - classOf[StructsToCsv] -> CometStructsToCsv) + private[comet] val structExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = + Map( + classOf[CreateNamedStruct] -> CometCreateNamedStruct, + classOf[GetArrayStructFields] -> CometGetArrayStructFields, + classOf[GetStructField] -> CometGetStructField, + classOf[JsonToStructs] -> CometJsonToStructs, + classOf[StructsToJson] -> CometStructsToJson, + classOf[StructsToCsv] -> CometStructsToCsv) private val hashExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[Crc32] -> CometScalarFunction("crc32"), @@ -152,40 +153,41 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[XxHash64] -> CometXxHash64, classOf[Sha1] -> CometSha1) - private val stringExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( - classOf[Ascii] -> CometScalarFunction("ascii"), - classOf[BitLength] -> CometScalarFunction("bit_length"), - classOf[Chr] -> CometScalarFunction("char"), - classOf[ConcatWs] -> CometConcatWs, - classOf[Concat] -> CometConcat, - classOf[Contains] -> CometScalarFunction("contains"), - classOf[EndsWith] -> CometScalarFunction("ends_with"), - classOf[GetJsonObject] -> CometGetJsonObject, - classOf[InitCap] -> CometInitCap, - classOf[Length] -> CometLength, - classOf[Like] -> CometLike, - classOf[Lower] -> CometLower, - classOf[OctetLength] -> CometScalarFunction("octet_length"), - classOf[RegExpReplace] -> CometRegExpReplace, - classOf[Reverse] -> CometReverse, - classOf[RLike] -> CometRLike, - classOf[StartsWith] -> CometScalarFunction("starts_with"), - classOf[StringInstr] -> CometScalarFunction("instr"), - classOf[StringRepeat] -> CometStringRepeat, - classOf[StringReplace] -> CometScalarFunction("replace"), - classOf[StringRPad] -> CometStringRPad, - 
classOf[StringLPad] -> CometStringLPad, - classOf[StringSpace] -> CometScalarFunction("space"), - classOf[StringSplit] -> CometStringSplit, - classOf[StringTranslate] -> CometScalarFunction("translate"), - classOf[StringTrim] -> CometScalarFunction("trim"), - classOf[StringTrimBoth] -> CometScalarFunction("btrim"), - classOf[StringTrimLeft] -> CometScalarFunction("ltrim"), - classOf[StringTrimRight] -> CometScalarFunction("rtrim"), - classOf[Left] -> CometLeft, - classOf[Right] -> CometRight, - classOf[Substring] -> CometSubstring, - classOf[Upper] -> CometUpper) + private[comet] val stringExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = + Map( + classOf[Ascii] -> CometScalarFunction("ascii"), + classOf[BitLength] -> CometScalarFunction("bit_length"), + classOf[Chr] -> CometScalarFunction("char"), + classOf[ConcatWs] -> CometConcatWs, + classOf[Concat] -> CometConcat, + classOf[Contains] -> CometScalarFunction("contains"), + classOf[EndsWith] -> CometScalarFunction("ends_with"), + classOf[GetJsonObject] -> CometGetJsonObject, + classOf[InitCap] -> CometInitCap, + classOf[Length] -> CometLength, + classOf[Like] -> CometLike, + classOf[Lower] -> CometLower, + classOf[OctetLength] -> CometScalarFunction("octet_length"), + classOf[RegExpReplace] -> CometRegExpReplace, + classOf[Reverse] -> CometReverse, + classOf[RLike] -> CometRLike, + classOf[StartsWith] -> CometScalarFunction("starts_with"), + classOf[StringInstr] -> CometScalarFunction("instr"), + classOf[StringRepeat] -> CometStringRepeat, + classOf[StringReplace] -> CometScalarFunction("replace"), + classOf[StringRPad] -> CometStringRPad, + classOf[StringLPad] -> CometStringLPad, + classOf[StringSpace] -> CometScalarFunction("space"), + classOf[StringSplit] -> CometStringSplit, + classOf[StringTranslate] -> CometScalarFunction("translate"), + classOf[StringTrim] -> CometScalarFunction("trim"), + classOf[StringTrimBoth] -> CometScalarFunction("btrim"), + classOf[StringTrimLeft] -> CometScalarFunction("ltrim"), + classOf[StringTrimRight] -> CometScalarFunction("rtrim"), + classOf[Left] -> CometLeft, + classOf[Right] -> CometRight, + classOf[Substring] -> CometSubstring, + classOf[Upper] -> CometUpper) private val bitwiseExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[BitwiseAnd] -> CometBitwiseAnd, @@ -197,33 +199,34 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[ShiftLeft] -> CometShiftLeft, classOf[ShiftRight] -> CometShiftRight) - private val temporalExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( - classOf[DateAdd] -> CometDateAdd, - classOf[DateDiff] -> CometDateDiff, - classOf[DateFormatClass] -> CometDateFormat, - classOf[DateFromUnixDate] -> CometDateFromUnixDate, - classOf[Days] -> CometDays, - classOf[Hours] -> CometHours, - classOf[DateSub] -> CometDateSub, - classOf[UnixDate] -> CometUnixDate, - classOf[FromUnixTime] -> CometFromUnixTime, - classOf[LastDay] -> CometLastDay, - classOf[Hour] -> CometHour, - classOf[MakeDate] -> CometMakeDate, - classOf[Minute] -> CometMinute, - classOf[NextDay] -> CometNextDay, - classOf[Second] -> CometSecond, - classOf[TruncDate] -> CometTruncDate, - classOf[TruncTimestamp] -> CometTruncTimestamp, - classOf[UnixTimestamp] -> CometUnixTimestamp, - classOf[Year] -> CometYear, - classOf[Month] -> CometMonth, - classOf[DayOfMonth] -> CometDayOfMonth, - classOf[DayOfWeek] -> CometDayOfWeek, - classOf[WeekDay] -> CometWeekDay, - classOf[DayOfYear] -> CometDayOfYear, - classOf[WeekOfYear] -> 
CometWeekOfYear, - classOf[Quarter] -> CometQuarter) + private[comet] val temporalExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = + Map( + classOf[DateAdd] -> CometDateAdd, + classOf[DateDiff] -> CometDateDiff, + classOf[DateFormatClass] -> CometDateFormat, + classOf[DateFromUnixDate] -> CometDateFromUnixDate, + classOf[Days] -> CometDays, + classOf[Hours] -> CometHours, + classOf[DateSub] -> CometDateSub, + classOf[UnixDate] -> CometUnixDate, + classOf[FromUnixTime] -> CometFromUnixTime, + classOf[LastDay] -> CometLastDay, + classOf[Hour] -> CometHour, + classOf[MakeDate] -> CometMakeDate, + classOf[Minute] -> CometMinute, + classOf[NextDay] -> CometNextDay, + classOf[Second] -> CometSecond, + classOf[TruncDate] -> CometTruncDate, + classOf[TruncTimestamp] -> CometTruncTimestamp, + classOf[UnixTimestamp] -> CometUnixTimestamp, + classOf[Year] -> CometYear, + classOf[Month] -> CometMonth, + classOf[DayOfMonth] -> CometDayOfMonth, + classOf[DayOfWeek] -> CometDayOfWeek, + classOf[WeekDay] -> CometWeekDay, + classOf[DayOfYear] -> CometDayOfYear, + classOf[WeekOfYear] -> CometWeekOfYear, + classOf[Quarter] -> CometQuarter) private val conversionExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[Cast] -> CometCast) diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 1c0823759e..04d2f110e5 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -183,9 +183,7 @@ object CometHour extends CometExpressionSerde[Hour] { val incompatReason = "Incorrectly applies timezone conversion to TimestampNTZ inputs" + " (https://github.com/apache/datafusion-comet/issues/3180)" - override def getIncompatibleReasons(): Seq[String] = Seq( - incompatReason - ) + override def getIncompatibleReasons(): Seq[String] = Seq(incompatReason) override def getSupportLevel(expr: Hour): SupportLevel = { if (expr.child.dataType.typeName == "timestamp_ntz") { @@ -541,12 +539,14 @@ object CometDateFormat extends CometExpressionSerde[DateFormatClass] { "yyyy-MM-dd'T'HH:mm:ss" -> "%Y-%m-%dT%H:%M:%S") override def getIncompatibleReasons(): Seq[String] = Seq( - "Non-UTC timezones may produce different results than Spark" - ) + "Non-UTC timezones may produce different results than Spark") override def getUnsupportedReasons(): Seq[String] = Seq( - s"Only the following formats are supported:${supportedFormats.mkString("\n- ", "\n-", "")}" - ) + "Only the following formats are supported:" + + supportedFormats.toSeq + .sortBy(_._1) + .map { case (k, v) => s"`$k` -> `$v`" } + .mkString("\n - ", "\n - ", "")) override def getSupportLevel(expr: DateFormatClass): SupportLevel = { // Check timezone - only UTC is fully compatible From 3413595d55d43691a23f8a37683afcda575b0187 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 23 Apr 2026 17:50:49 -0600 Subject: [PATCH 06/14] docs: migrate expression compat bullets into serdes Move hand-written incompatibility and unsupported notes for CollectSet, Average, SortArray, TruncTimestamp, and StructsToJson from the per-category markdown pages into the corresponding serde via getIncompatibleReasons / getUnsupportedReasons, so GenerateDocs drives the compatibility guide from a single source of truth. Clarify in the trait scaladoc that reasons should be written in Markdown. 
--- .../latest/compatibility/expressions/aggregate.md | 10 +--------- .../latest/compatibility/expressions/array.md | 2 -- .../latest/compatibility/expressions/datetime.md | 4 ---- .../latest/compatibility/expressions/struct.md | 3 --- .../comet/serde/CometAggregateExpressionSerde.scala | 6 ++++-- .../org/apache/comet/serde/CometExpressionSerde.scala | 6 ++++-- .../main/scala/org/apache/comet/serde/aggregates.scala | 9 +++++++++ .../src/main/scala/org/apache/comet/serde/arrays.scala | 4 ++++ .../main/scala/org/apache/comet/serde/datetime.scala | 4 ++++ .../main/scala/org/apache/comet/serde/structs.scala | 4 ++++ 10 files changed, 30 insertions(+), 22 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility/expressions/aggregate.md b/docs/source/user-guide/latest/compatibility/expressions/aggregate.md index e97593bf46..1721274bf9 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/aggregate.md +++ b/docs/source/user-guide/latest/compatibility/expressions/aggregate.md @@ -22,16 +22,8 @@ under the License. -## Incompatible Aggregates - -- **CollectSet**: Comet deduplicates NaN values (treats `NaN == NaN`) while Spark treats each NaN as a distinct value. - When `spark.comet.exec.strictFloatingPoint=true`, `collect_set` on floating-point types falls back to Spark unless - `spark.comet.expression.CollectSet.allowIncompatible=true` is set. - ## ANSI Mode -Comet will fall back to Spark for the following aggregate expressions when ANSI mode is enabled. These can be enabled by setting `spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See the [Comet Supported Expressions Guide](../../expressions.md) for more information on this configuration setting. - -- Average (supports all numeric inputs except decimal types) +Comet will fall back to Spark for some aggregate expressions when ANSI mode is enabled. These can be enabled by setting `spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See the [Comet Supported Expressions Guide](../../expressions.md) for more information on this configuration setting. There is an [epic](https://github.com/apache/datafusion-comet/issues/313) where we are tracking the work to fully implement ANSI support. diff --git a/docs/source/user-guide/latest/compatibility/expressions/array.md b/docs/source/user-guide/latest/compatibility/expressions/array.md index fe10b43a05..c7f2569b40 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/array.md +++ b/docs/source/user-guide/latest/compatibility/expressions/array.md @@ -21,5 +21,3 @@ under the License. - -- **SortArray**: Nested arrays with `Struct` or `Null` child values are not supported natively and will fall back to Spark. diff --git a/docs/source/user-guide/latest/compatibility/expressions/datetime.md b/docs/source/user-guide/latest/compatibility/expressions/datetime.md index 430d86576e..e07bde3c23 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/datetime.md +++ b/docs/source/user-guide/latest/compatibility/expressions/datetime.md @@ -21,7 +21,3 @@ under the License. - -- **TruncTimestamp (date_trunc)**: Produces incorrect results when used with non-UTC timezones. Compatible when - timezone is UTC. 
- [#2649](https://github.com/apache/datafusion-comet/issues/2649) diff --git a/docs/source/user-guide/latest/compatibility/expressions/struct.md b/docs/source/user-guide/latest/compatibility/expressions/struct.md index d940a8d52d..1eaaf4a5e2 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/struct.md +++ b/docs/source/user-guide/latest/compatibility/expressions/struct.md @@ -21,6 +21,3 @@ under the License. - -- **StructsToJson (to_json)**: Does not support `+Infinity` and `-Infinity` for numeric types (float, double). - [#3016](https://github.com/apache/datafusion-comet/issues/3016) diff --git a/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala index bc2f6fa793..ba220dcce8 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala @@ -41,7 +41,8 @@ trait CometAggregateExpressionSerde[T <: AggregateFunction] { /** * Get documentation for usages where this expression may be incompatible with Spark. This is - * called from GenerateDocs when generating the Compatibility Guide. + * called from GenerateDocs when generating the Compatibility Guide. Each reason should be + * written in Markdown and may span multiple lines. * * @return * List of reasons, defaulting to an empty list. @@ -50,7 +51,8 @@ trait CometAggregateExpressionSerde[T <: AggregateFunction] { /** * Get documentation for usages where this expression is unsupported with Spark. This is called - * from GenerateDocs when generating the Compatibility Guide. + * from GenerateDocs when generating the Compatibility Guide. Each reason should be written in + * Markdown and may span multiple lines. * * @return * List of reasons, defaulting to an empty list. diff --git a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala index 58c6910475..afad6f6bb2 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala @@ -39,7 +39,8 @@ trait CometExpressionSerde[T <: Expression] { /** * Get documentation for usages where this expression may be incompatible with Spark. This is - * called from GenerateDocs when generating the Compatibility Guide. + * called from GenerateDocs when generating the Compatibility Guide. Each reason should be + * written in Markdown and may span multiple lines. * * @return * List of reasons, defaulting to an empty list. @@ -48,7 +49,8 @@ trait CometExpressionSerde[T <: Expression] { /** * Get documentation for usages where this expression is unsupported with Spark. This is called - * from GenerateDocs when generating the Compatibility Guide. + * from GenerateDocs when generating the Compatibility Guide. Each reason should be written in + * Markdown and may span multiple lines. * * @return * List of reasons, defaulting to an empty list. 
diff --git a/spark/src/main/scala/org/apache/comet/serde/aggregates.scala b/spark/src/main/scala/org/apache/comet/serde/aggregates.scala index 7d78bbe3e5..549776e7c3 100644 --- a/spark/src/main/scala/org/apache/comet/serde/aggregates.scala +++ b/spark/src/main/scala/org/apache/comet/serde/aggregates.scala @@ -151,6 +151,9 @@ object CometCount extends CometAggregateExpressionSerde[Count] { object CometAverage extends CometAggregateExpressionSerde[Average] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Falls back to Spark in ANSI mode. Supports all numeric inputs except decimal types.") + override def convert( aggExpr: AggregateExpression, avg: Average, @@ -666,6 +669,12 @@ object CometBloomFilterAggregate extends CometAggregateExpressionSerde[BloomFilt object CometCollectSet extends CometAggregateExpressionSerde[CollectSet] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Comet deduplicates NaN values (treats `NaN == NaN`) while Spark treats each NaN as a" + + s" distinct value. When `${COMET_EXEC_STRICT_FLOATING_POINT.key}=true`, `collect_set`" + + " on floating-point types falls back to Spark unless" + + " `spark.comet.expression.CollectSet.allowIncompatible=true` is set.") + override def getSupportLevel(expr: CollectSet): SupportLevel = { if (COMET_EXEC_STRICT_FLOATING_POINT.get() && SupportLevel.containsFloatingPoint(expr.children.head.dataType)) { diff --git a/spark/src/main/scala/org/apache/comet/serde/arrays.scala b/spark/src/main/scala/org/apache/comet/serde/arrays.scala index 14d4536fc1..41dd823cc8 100644 --- a/spark/src/main/scala/org/apache/comet/serde/arrays.scala +++ b/spark/src/main/scala/org/apache/comet/serde/arrays.scala @@ -123,6 +123,10 @@ object CometArrayContains extends CometExpressionSerde[ArrayContains] { object CometSortArray extends CometExpressionSerde[SortArray] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Nested arrays with `Struct` or `Null` child values are not supported natively and will" + + " fall back to Spark.") + private def supportedSortArrayElementType( dt: DataType, nestedInArray: Boolean = false): Boolean = { diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 04d2f110e5..74b88d8ebc 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -425,6 +425,10 @@ object CometTruncDate extends CometExpressionSerde[TruncDate] { object CometTruncTimestamp extends CometExpressionSerde[TruncTimestamp] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Produces incorrect results when used with non-UTC timezones. Compatible when timezone is" + + " UTC. (https://github.com/apache/datafusion-comet/issues/2649)") + val supportedFormats: Seq[String] = Seq( "year", diff --git a/spark/src/main/scala/org/apache/comet/serde/structs.scala b/spark/src/main/scala/org/apache/comet/serde/structs.scala index 449d0fc5b9..a9fffcf6f7 100644 --- a/spark/src/main/scala/org/apache/comet/serde/structs.scala +++ b/spark/src/main/scala/org/apache/comet/serde/structs.scala @@ -105,6 +105,10 @@ object CometGetArrayStructFields extends CometExpressionSerde[GetArrayStructFiel object CometStructsToJson extends CometExpressionSerde[StructsToJson] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Does not support `+Infinity` and `-Infinity` for numeric types (float, double)." 
+ + " (https://github.com/apache/datafusion-comet/issues/3016)") + override def getSupportLevel(expr: StructsToJson): SupportLevel = Incompatible( Some( From 7cc25ff666a946ffd800f39ad1f9b18d0098af28 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 23 Apr 2026 17:55:09 -0600 Subject: [PATCH 07/14] remove old content --- .../latest/compatibility/expressions/aggregate.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility/expressions/aggregate.md b/docs/source/user-guide/latest/compatibility/expressions/aggregate.md index 1721274bf9..8d15eea43d 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/aggregate.md +++ b/docs/source/user-guide/latest/compatibility/expressions/aggregate.md @@ -21,9 +21,3 @@ under the License. - -## ANSI Mode - -Comet will fall back to Spark for some aggregate expressions when ANSI mode is enabled. These can be enabled by setting `spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See the [Comet Supported Expressions Guide](../../expressions.md) for more information on this configuration setting. - -There is an [epic](https://github.com/apache/datafusion-comet/issues/313) where we are tracking the work to fully implement ANSI support. From 55c05dc827a2af866ffe0512ccc5ff4561381cca Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 23 Apr 2026 17:58:15 -0600 Subject: [PATCH 08/14] docs: add math expressions compatibility page Create compatibility/expressions/math.md with an EXPR_COMPAT marker block and wire it to QueryPlanSerde.mathExpressions in GenerateDocs so CometAbs's unsupported-reason note surfaces in the guide. Add math to the expressions toctree. --- .../latest/compatibility/expressions/index.md | 1 + .../latest/compatibility/expressions/math.md | 23 +++++++++++++++++++ .../scala/org/apache/comet/GenerateDocs.scala | 5 ++++ 3 files changed, 29 insertions(+) create mode 100644 docs/source/user-guide/latest/compatibility/expressions/math.md diff --git a/docs/source/user-guide/latest/compatibility/expressions/index.md b/docs/source/user-guide/latest/compatibility/expressions/index.md index 3084a3930b..2c4742f36d 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/index.md @@ -31,6 +31,7 @@ Compatibility notes are grouped by expression category: aggregate array datetime +math struct cast ``` diff --git a/docs/source/user-guide/latest/compatibility/expressions/math.md b/docs/source/user-guide/latest/compatibility/expressions/math.md new file mode 100644 index 0000000000..6d8905adf4 --- /dev/null +++ b/docs/source/user-guide/latest/compatibility/expressions/math.md @@ -0,0 +1,23 @@ + + +# Math Expressions + + + diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index 446466a7ba..5e4d82ca9c 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -58,6 +58,11 @@ object GenerateDocs { QueryPlanSerde.temporalExpressions.toSeq.map { case (cls, serde) => (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) }), + "math" -> ("compatibility/expressions/math.md", + () => + QueryPlanSerde.mathExpressions.toSeq.map { case (cls, serde) => + (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + }), "struct" -> ("compatibility/expressions/struct.md", () 
=> QueryPlanSerde.structExpressions.toSeq.map { case (cls, serde) => From 2900e1f385d5c7466b132643b5d84e4eed3abbdd Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 23 Apr 2026 18:40:30 -0600 Subject: [PATCH 09/14] style: add explicit type annotations to public vals Satisfy scalafix DisableSyntax.noExplicitPublicVal by annotating CometHour.incompatReason and CometAbs.unsupportedReason as `: String`. --- spark/src/main/scala/org/apache/comet/serde/datetime.scala | 2 +- spark/src/main/scala/org/apache/comet/serde/math.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 74b88d8ebc..1842081824 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -180,7 +180,7 @@ object CometQuarter extends CometExpressionSerde[Quarter] with CometExprGetDateF object CometHour extends CometExpressionSerde[Hour] { - val incompatReason = "Incorrectly applies timezone conversion to TimestampNTZ inputs" + + val incompatReason: String = "Incorrectly applies timezone conversion to TimestampNTZ inputs" + " (https://github.com/apache/datafusion-comet/issues/3180)" override def getIncompatibleReasons(): Seq[String] = Seq(incompatReason) diff --git a/spark/src/main/scala/org/apache/comet/serde/math.scala b/spark/src/main/scala/org/apache/comet/serde/math.scala index f0fa1bd779..2f3a6902d6 100644 --- a/spark/src/main/scala/org/apache/comet/serde/math.scala +++ b/spark/src/main/scala/org/apache/comet/serde/math.scala @@ -164,7 +164,7 @@ object CometUnhex extends CometExpressionSerde[Unhex] with MathExprBase { object CometAbs extends CometExpressionSerde[Abs] with MathExprBase { - val unsupportedReason = "Only integral, floating-point, and decimal types are supported" + val unsupportedReason: String = "Only integral, floating-point, and decimal types are supported" override def getUnsupportedReasons(): Seq[String] = Seq(unsupportedReason) From c948feb6e79192c5e818a49916b351737d080a5a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 24 Apr 2026 07:37:05 -0600 Subject: [PATCH 10/14] improve doc format --- .../scala/org/apache/comet/GenerateDocs.scala | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index 5e4d82ca9c..32c8c0fdcd 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -176,19 +176,25 @@ object GenerateDocs { } private def writeExpressionCompatNotes(w: BufferedOutputStream, notes: CategoryNotes): Unit = { - val sorted = notes.sortBy(_._1) - val incompat = sorted.flatMap { case (name, incompat, _) => incompat.map((name, _)) } - val unsupported = sorted.flatMap { case (name, _, unsupported) => unsupported.map((name, _)) } - if (incompat.nonEmpty) { - w.write("\n### Incompatible With Spark\n\n".getBytes) - for ((name, reason) <- incompat) { - w.write(s"- **$name**: $reason\n".getBytes) - } + val sorted = notes.sortBy(_._1).filter { case (_, incompat, unsupported) => + incompat.nonEmpty || unsupported.nonEmpty } - if (unsupported.nonEmpty) { - w.write("\n### Unsupported Cases\n\n".getBytes) - for ((name, reason) <- unsupported) { - w.write(s"- **$name**: $reason\n".getBytes) + for ((name, incompat, unsupported) <- sorted) { + w.write(s"\n### 
$name\n".getBytes) + if (incompat.nonEmpty) { + w.write( + (s"\nThe following incompatibilities cause `$name` to fall back to Spark by default." + + s" Set `spark.comet.expression.$name.allowIncompatible=true` to enable Comet" + + " acceleration despite these differences.\n\n").getBytes) + for (reason <- incompat) { + w.write(s"- $reason\n".getBytes) + } + } + if (unsupported.nonEmpty) { + w.write("\nThe following cases are not supported by Comet:\n\n".getBytes) + for (reason <- unsupported) { + w.write(s"- $reason\n".getBytes) + } } } } From a013aede8e5629d4010cf47c0a7c736282e5f6f2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 24 Apr 2026 08:25:43 -0600 Subject: [PATCH 11/14] docs: document getIncompatibleReasons and getUnsupportedReasons serde methods Add guidance to the contributor guide covering the new documentation methods on CometExpressionSerde. Also simplify CometDateFormat's getUnsupportedReasons to list only the supported Spark format keys. --- .../adding_a_new_expression.md | 65 ++++++++++++++++++- .../org/apache/comet/serde/datetime.scala | 5 +- 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/docs/source/contributor-guide/adding_a_new_expression.md b/docs/source/contributor-guide/adding_a_new_expression.md index 10af50e069..26a520b40d 100644 --- a/docs/source/contributor-guide/adding_a_new_expression.md +++ b/docs/source/contributor-guide/adding_a_new_expression.md @@ -70,10 +70,12 @@ object CometUnhex extends CometExpressionSerde[Unhex] { } ``` -The `CometExpressionSerde` trait provides three methods you can override: +The `CometExpressionSerde` trait provides several methods you can override: - `convert(expr: T, inputs: Seq[Attribute], binding: Boolean): Option[Expr]` - **Required**. Converts the Spark expression to protobuf. Return `None` if the expression cannot be converted. -- `getSupportLevel(expr: T): SupportLevel` - Optional. Returns the level of support for the expression. See "Using getSupportLevel" section below for details. +- `getSupportLevel(expr: T): SupportLevel` - Optional. Returns the level of support for the expression at planning time, based on a specific expression instance. See "Using getSupportLevel" section below for details. +- `getIncompatibleReasons(): Seq[String]` - Optional. Returns reasons why this expression may produce different results than Spark. Used to generate the Compatibility Guide. See "Documenting Incompatible and Unsupported Reasons" below. +- `getUnsupportedReasons(): Seq[String]` - Optional. Returns reasons why this expression may not be supported by Comet (for example, unsupported data types or format strings). Used to generate the Compatibility Guide. See "Documenting Incompatible and Unsupported Reasons" below. - `getExprConfigName(expr: T): String` - Optional. Returns a short name for configuration keys. Defaults to the Spark class name. For simple scalar functions that map directly to a DataFusion function, you can use the built-in `CometScalarFunction` implementation: @@ -208,6 +210,65 @@ When the query planner encounters an expression: Any notes provided will be logged to help with debugging and understanding why an expression was not used. +#### Documenting Incompatible and Unsupported Reasons + +In addition to `getSupportLevel`, which governs runtime planning decisions, the serde trait exposes two static documentation methods: + +- `getIncompatibleReasons(): Seq[String]` - Reasons the expression may produce different results than Spark. 
+- `getUnsupportedReasons(): Seq[String]` - Reasons the expression, or certain usages of it, may not be supported by Comet. + +These methods do not affect runtime behavior. They are called by `GenerateDocs` (`spark/src/main/scala/org/apache/comet/GenerateDocs.scala`) when building the user-facing Compatibility Guide pages under `docs/source/user-guide/latest/compatibility/expressions/` (for example, `math.md`, `datetime.md`, `array.md`, `aggregate.md`, `struct.md`). Each reason is rendered as a bullet in the corresponding page. + +Key differences from `getSupportLevel`: + +- **No expression instance.** Both methods take no arguments, so they describe the expression in general rather than a specific call site. Use `getSupportLevel` for checks that depend on data types, argument values, or other per-instance details. +- **Markdown-friendly.** Each returned string is written to a Markdown document, so you can embed backticks, links, and line breaks. Keep each reason self-contained, since they are rendered as separate bullets. +- **Regenerated by CI.** The lists are collected by `GenerateDocs` and published by CI on every merge to `main`. The generated Markdown is not committed to the repo, so you do not need to regenerate or commit it yourself. The reasons do not have to match the `notes` passed to `Compatible`, `Incompatible`, or `Unsupported`, but keeping them consistent avoids confusing users. + +##### Example: Incompatibility note + +```scala +object CometStructsToJson extends CometExpressionSerde[StructsToJson] { + + override def getIncompatibleReasons(): Seq[String] = Seq( + "Does not support `+Infinity` and `-Infinity` for numeric types (float, double)." + + " (https://github.com/apache/datafusion-comet/issues/3016)") + + override def getSupportLevel(expr: StructsToJson): SupportLevel = + Incompatible( + Some( + "Does not support Infinity/-Infinity for numeric types" + + " (https://github.com/apache/datafusion-comet/issues/3016)")) + + // ... convert ... +} +``` + +##### Example: Unsupported note + +```scala +object CometSortArray extends CometExpressionSerde[SortArray] { + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Nested arrays with `Struct` or `Null` child values are not supported natively and will" + + " fall back to Spark.") + + // ... convert ... +} +``` + +##### Example: Enumerating supported values + +When the expression only supports a known set of argument values, build the reason from that set so the documentation stays in sync with the code: + +```scala +override def getUnsupportedReasons(): Seq[String] = Seq( + "Only the following formats are supported:" + + supportedFormats.keys.toSeq.sorted + .map(k => s"`$k`") + .mkString("\n - ", "\n - ", "")) +``` + #### Adding Spark-side Tests for the New Expression It is important to verify that the new expression is correctly recognized by the native execution engine and matches the expected Spark behavior. The preferred way to add test coverage is to write a SQL test file using the SQL file test framework. This approach is simpler than writing Scala test code and makes it easy to cover many input combinations and edge cases. 
diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 1842081824..52b086184b 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -547,9 +547,8 @@ object CometDateFormat extends CometExpressionSerde[DateFormatClass] { override def getUnsupportedReasons(): Seq[String] = Seq( "Only the following formats are supported:" + - supportedFormats.toSeq - .sortBy(_._1) - .map { case (k, v) => s"`$k` -> `$v`" } + supportedFormats.keys.toSeq.sorted + .map(k => s"`$k`") .mkString("\n - ", "\n - ", "")) override def getSupportLevel(expr: DateFormatClass): SupportLevel = { From abff4990c6824097bc0059b6e461d5bfadc91d15 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 24 Apr 2026 09:50:29 -0600 Subject: [PATCH 12/14] docs: add getIncompatibleReasons/getUnsupportedReasons to all expression serdes - Remove getSupportLevel override from CometArrayAppend (always returned Compatible, which is the default) - Add getIncompatibleReasons() to all always-Incompatible expression serdes - Add getIncompatibleReasons() and/or getUnsupportedReasons() to all conditional expression serdes that were missing them - Add compatibility guide pages for string, map, and misc expression categories - Register new categories in GenerateDocs so content is auto-generated Co-Authored-By: Claude Opus 4.6 --- .../latest/compatibility/expressions/index.md | 3 ++ .../latest/compatibility/expressions/map.md | 23 ++++++++++++ .../latest/compatibility/expressions/misc.md | 23 ++++++++++++ .../compatibility/expressions/string.md | 23 ++++++++++++ .../scala/org/apache/comet/GenerateDocs.scala | 15 ++++++++ .../apache/comet/expressions/CometCast.scala | 7 ++++ .../apache/comet/serde/CometSortOrder.scala | 4 +++ .../apache/comet/serde/QueryPlanSerde.scala | 4 +-- .../scala/org/apache/comet/serde/arrays.scala | 27 ++++++++++++-- .../comet/serde/collectionOperations.scala | 3 ++ .../comet/serde/contraintExpressions.scala | 3 ++ .../org/apache/comet/serde/datetime.scala | 19 ++++++++++ .../comet/serde/decimalExpressions.scala | 2 ++ .../org/apache/comet/serde/literals.scala | 3 ++ .../scala/org/apache/comet/serde/maps.scala | 3 ++ .../scala/org/apache/comet/serde/math.scala | 2 ++ .../org/apache/comet/serde/strings.scala | 35 +++++++++++++++++++ .../org/apache/comet/serde/structs.scala | 10 ++++++ .../org/apache/comet/serde/unixtime.scala | 5 +++ 19 files changed, 210 insertions(+), 4 deletions(-) create mode 100644 docs/source/user-guide/latest/compatibility/expressions/map.md create mode 100644 docs/source/user-guide/latest/compatibility/expressions/misc.md create mode 100644 docs/source/user-guide/latest/compatibility/expressions/string.md diff --git a/docs/source/user-guide/latest/compatibility/expressions/index.md b/docs/source/user-guide/latest/compatibility/expressions/index.md index 2c4742f36d..9fbec44f0b 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/index.md @@ -31,7 +31,10 @@ Compatibility notes are grouped by expression category: aggregate array datetime +map math +misc +string struct cast ``` diff --git a/docs/source/user-guide/latest/compatibility/expressions/map.md b/docs/source/user-guide/latest/compatibility/expressions/map.md new file mode 100644 index 0000000000..c8e2fdba2a --- /dev/null +++ b/docs/source/user-guide/latest/compatibility/expressions/map.md @@ -0,0 
+1,23 @@ + + +# Map Expressions + + + diff --git a/docs/source/user-guide/latest/compatibility/expressions/misc.md b/docs/source/user-guide/latest/compatibility/expressions/misc.md new file mode 100644 index 0000000000..58d9a0e122 --- /dev/null +++ b/docs/source/user-guide/latest/compatibility/expressions/misc.md @@ -0,0 +1,23 @@ + + +# Miscellaneous Expressions + + + diff --git a/docs/source/user-guide/latest/compatibility/expressions/string.md b/docs/source/user-guide/latest/compatibility/expressions/string.md new file mode 100644 index 0000000000..d86f299c03 --- /dev/null +++ b/docs/source/user-guide/latest/compatibility/expressions/string.md @@ -0,0 +1,23 @@ + + +# String Expressions + + + diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index 32c8c0fdcd..2360508c6c 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -72,6 +72,21 @@ object GenerateDocs { () => QueryPlanSerde.aggrSerdeMap.toSeq.map { case (cls, serde) => (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + }), + "string" -> ("compatibility/expressions/string.md", + () => + QueryPlanSerde.stringExpressions.toSeq.map { case (cls, serde) => + (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + }), + "map" -> ("compatibility/expressions/map.md", + () => + QueryPlanSerde.mapExpressions.toSeq.map { case (cls, serde) => + (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + }), + "misc" -> ("compatibility/expressions/misc.md", + () => + QueryPlanSerde.miscExpressions.toSeq.map { case (cls, serde) => + (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) })) def main(args: Array[String]): Unit = { diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 3b4c4b3bd4..4a367040da 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -48,6 +48,13 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { DataTypes.TimestampType, DataTypes.TimestampNTZType) + override def getIncompatibleReasons(): Seq[String] = Seq( + "Some cast operations between specific type pairs may produce different results than Spark." + + " Refer to the compatibility guide for the full matrix of supported cast operations.") + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Not all cast type combinations are supported. 
Unsupported casts fall back to Spark.") + override def getSupportLevel(cast: Cast): SupportLevel = { if (cast.child.isInstanceOf[Literal]) { // casting from literal is compatible because we delegate to Spark diff --git a/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala b/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala index 3390d86cee..3647645109 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala @@ -27,6 +27,10 @@ import org.apache.comet.serde.QueryPlanSerde.exprToProtoInternal object CometSortOrder extends CometExpressionSerde[SortOrder] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "When `" + CometConf.COMET_EXEC_STRICT_FLOATING_POINT.key + "=true`, sorting on" + + " floating-point types is not 100% compatible with Spark") + override def getSupportLevel(expr: SortOrder): SupportLevel = { if (CometConf.COMET_EXEC_STRICT_FLOATING_POINT.get() && diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 8114e54b4d..73a323a16b 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -127,7 +127,7 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[Abs] -> CometAbs, classOf[Bin] -> CometScalarFunction("bin")) - private val mapExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + private[comet] val mapExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[GetMapValue] -> CometMapExtract, classOf[MapKeys] -> CometMapKeys, classOf[MapEntries] -> CometMapEntries, @@ -231,7 +231,7 @@ object QueryPlanSerde extends Logging with CometExprShim { private val conversionExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[Cast] -> CometCast) - private val miscExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + private[comet] val miscExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( // TODO PromotePrecision classOf[Alias] -> CometAlias, classOf[AttributeReference] -> CometAttributeReference, diff --git a/spark/src/main/scala/org/apache/comet/serde/arrays.scala b/spark/src/main/scala/org/apache/comet/serde/arrays.scala index 41dd823cc8..3cab40da78 100644 --- a/spark/src/main/scala/org/apache/comet/serde/arrays.scala +++ b/spark/src/main/scala/org/apache/comet/serde/arrays.scala @@ -57,8 +57,6 @@ object CometArrayRemove object CometArrayAppend extends CometExpressionSerde[ArrayAppend] { - override def getSupportLevel(expr: ArrayAppend): SupportLevel = Compatible() - override def convert( expr: ArrayAppend, inputs: Seq[Attribute], @@ -123,6 +121,10 @@ object CometArrayContains extends CometExpressionSerde[ArrayContains] { object CometSortArray extends CometExpressionSerde[SortArray] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "When `" + CometConf.COMET_EXEC_STRICT_FLOATING_POINT.key + "=true`, sorting on" + + " floating-point types is not 100% compatible with Spark") + override def getUnsupportedReasons(): Seq[String] = Seq( "Nested arrays with `Struct` or `Null` child values are not supported natively and will" + " fall back to Spark.") @@ -191,6 +193,9 @@ object CometSortArray extends CometExpressionSerde[SortArray] { object CometArrayIntersect extends CometExpressionSerde[ArrayIntersect] { + override def 
getIncompatibleReasons(): Seq[String] = Seq( + "Null handling and ordering may differ from Spark") + override def getSupportLevel(expr: ArrayIntersect): SupportLevel = Incompatible(None) override def convert( @@ -314,6 +319,9 @@ object CometArrayCompact extends CometExpressionSerde[Expression] { object CometArrayExcept extends CometExpressionSerde[ArrayExcept] with CometExprShim { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Null handling and ordering may differ from Spark") + override def getSupportLevel(expr: ArrayExcept): SupportLevel = Incompatible(None) @tailrec @@ -353,6 +361,8 @@ object CometArrayExcept extends CometExpressionSerde[ArrayExcept] with CometExpr object CometArrayJoin extends CometExpressionSerde[ArrayJoin] { + override def getIncompatibleReasons(): Seq[String] = Seq("Null handling may differ from Spark") + override def getSupportLevel(expr: ArrayJoin): SupportLevel = Incompatible(None) override def convert( @@ -390,6 +400,8 @@ object CometArrayJoin extends CometExpressionSerde[ArrayJoin] { object CometArrayInsert extends CometExpressionSerde[ArrayInsert] { + override def getIncompatibleReasons(): Seq[String] = Seq("Null handling may differ from Spark") + override def getSupportLevel(expr: ArrayInsert): SupportLevel = Incompatible(None) override def convert( @@ -498,6 +510,8 @@ object CometGetArrayItem extends CometExpressionSerde[GetArrayItem] { object CometArrayReverse extends CometExpressionSerde[Reverse] with ArraysBase { val unsupportedReason = "reverse on array containing binary is not supported" + override def getIncompatibleReasons(): Seq[String] = Seq(unsupportedReason) + @tailrec private def containsBinary(dt: DataType): Boolean = { dt match { @@ -589,6 +603,9 @@ object CometFlatten extends CometExpressionSerde[Flatten] with ArraysBase { object CometArrayFilter extends CometExpressionSerde[ArrayFilter] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `array_filter` when the function is `IsNotNull` (used by `array_compact`)") + override def getSupportLevel(expr: ArrayFilter): SupportLevel = { expr.function.children.headOption match { case Some(_: IsNotNull) => Compatible() @@ -606,6 +623,9 @@ object CometArrayFilter extends CometExpressionSerde[ArrayFilter] { object CometSize extends CometExpressionSerde[Size] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `ArrayType` input; `MapType` input is not supported") + override def getSupportLevel(expr: Size): SupportLevel = { expr.child.dataType match { case _: ArrayType => Compatible() @@ -660,6 +680,9 @@ object CometSize extends CometExpressionSerde[Size] { object CometArraysZip extends CometExpressionSerde[ArraysZip] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Not all input data types are supported; falls back to Spark for unsupported types") + private def isTypeSupported(dt: DataType): Boolean = { import DataTypes._ dt match { diff --git a/spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala b/spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala index 716241040e..4eb2ec6a0d 100644 --- a/spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala +++ b/spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala @@ -26,6 +26,9 @@ import org.apache.comet.serde.ExprOuterClass.Expr object CometReverse extends CometScalarFunction[Reverse]("reverse") { + override def getIncompatibleReasons(): Seq[String] = + CometArrayReverse.getIncompatibleReasons() + override def 
getSupportLevel(expr: Reverse): SupportLevel = { if (expr.child.dataType.isInstanceOf[ArrayType]) { CometArrayReverse.getSupportLevel(expr) diff --git a/spark/src/main/scala/org/apache/comet/serde/contraintExpressions.scala b/spark/src/main/scala/org/apache/comet/serde/contraintExpressions.scala index 62643d6da6..80a2a39ef4 100644 --- a/spark/src/main/scala/org/apache/comet/serde/contraintExpressions.scala +++ b/spark/src/main/scala/org/apache/comet/serde/contraintExpressions.scala @@ -28,6 +28,9 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn object CometKnownFloatingPointNormalized extends CometExpressionSerde[KnownFloatingPointNormalized] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `NormalizeNaNAndZero` child expressions") + override def getSupportLevel(expr: KnownFloatingPointNormalized): SupportLevel = { expr.child match { case _: NormalizeNaNAndZero => Compatible() diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 52b086184b..10580aab20 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -220,6 +220,10 @@ object CometHour extends CometExpressionSerde[Hour] { object CometMinute extends CometExpressionSerde[Minute] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Incorrectly applies timezone conversion to TimestampNTZ inputs" + + " (https://github.com/apache/datafusion-comet/issues/3180)") + override def getSupportLevel(expr: Minute): SupportLevel = { if (expr.child.dataType.typeName == "timestamp_ntz") { Incompatible( @@ -258,6 +262,10 @@ object CometMinute extends CometExpressionSerde[Minute] { object CometSecond extends CometExpressionSerde[Second] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Incorrectly applies timezone conversion to TimestampNTZ inputs" + + " (https://github.com/apache/datafusion-comet/issues/3180)") + override def getSupportLevel(expr: Second): SupportLevel = { if (expr.child.dataType.typeName == "timestamp_ntz") { Incompatible( @@ -296,6 +304,11 @@ object CometSecond extends CometExpressionSerde[Second] { object CometUnixTimestamp extends CometExpressionSerde[UnixTimestamp] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only `TimestampType` and `DateType` inputs are supported." + + " `TimestampNTZType` is not supported because Comet incorrectly applies timezone" + + " conversion to TimestampNTZ values.") + private def isSupportedInputType(expr: UnixTimestamp): Boolean = { // Note: TimestampNTZType is not supported because Comet incorrectly applies // timezone conversion to TimestampNTZ values. 
TimestampNTZ stores local time @@ -392,6 +405,12 @@ object CometTruncDate extends CometExpressionSerde[TruncDate] { val supportedFormats: Seq[String] = Seq("year", "yyyy", "yy", "quarter", "mon", "month", "mm", "week") + override def getIncompatibleReasons(): Seq[String] = Seq( + "Non-literal format strings will throw an exception instead of returning NULL") + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only the following formats are supported: " + supportedFormats.mkString(", ")) + override def getSupportLevel(expr: TruncDate): SupportLevel = { expr.format match { case Literal(fmt: UTF8String, _) => diff --git a/spark/src/main/scala/org/apache/comet/serde/decimalExpressions.scala b/spark/src/main/scala/org/apache/comet/serde/decimalExpressions.scala index 880f01742b..52adf950c1 100644 --- a/spark/src/main/scala/org/apache/comet/serde/decimalExpressions.scala +++ b/spark/src/main/scala/org/apache/comet/serde/decimalExpressions.scala @@ -39,6 +39,8 @@ object CometUnscaledValue extends CometExpressionSerde[UnscaledValue] { object CometMakeDecimal extends CometExpressionSerde[MakeDecimal] { + override def getUnsupportedReasons(): Seq[String] = Seq("Only `LongType` input is supported") + override def getSupportLevel(expr: MakeDecimal): SupportLevel = { expr.child.dataType match { case LongType => Compatible() diff --git a/spark/src/main/scala/org/apache/comet/serde/literals.scala b/spark/src/main/scala/org/apache/comet/serde/literals.scala index e24b55449c..c81b146a89 100644 --- a/spark/src/main/scala/org/apache/comet/serde/literals.scala +++ b/spark/src/main/scala/org/apache/comet/serde/literals.scala @@ -37,6 +37,9 @@ import org.apache.comet.serde.Types.ListLiteral object CometLiteral extends CometExpressionSerde[Literal] with Logging { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Not all data types are supported for literal values") + override def getSupportLevel(expr: Literal): SupportLevel = { if (supportedDataType( diff --git a/spark/src/main/scala/org/apache/comet/serde/maps.scala b/spark/src/main/scala/org/apache/comet/serde/maps.scala index ceafc157c4..1d2fcf9366 100644 --- a/spark/src/main/scala/org/apache/comet/serde/maps.scala +++ b/spark/src/main/scala/org/apache/comet/serde/maps.scala @@ -137,6 +137,9 @@ object CometMapFromEntries extends CometScalarFunction[MapFromEntries]("map_from val keyUnsupportedReason = "Using BinaryType as Map keys is not allowed in map_from_entries" val valueUnsupportedReason = "Using BinaryType as Map values is not allowed in map_from_entries" + override def getIncompatibleReasons(): Seq[String] = + Seq(keyUnsupportedReason, valueUnsupportedReason) + private def containsBinary(dataType: DataType): Boolean = { dataType match { case BinaryType => true diff --git a/spark/src/main/scala/org/apache/comet/serde/math.scala b/spark/src/main/scala/org/apache/comet/serde/math.scala index 2f3a6902d6..401d14cc78 100644 --- a/spark/src/main/scala/org/apache/comet/serde/math.scala +++ b/spark/src/main/scala/org/apache/comet/serde/math.scala @@ -205,6 +205,8 @@ sealed trait MathExprBase { object CometCheckOverflow extends CometExpressionSerde[CheckOverflow] { + override def getUnsupportedReasons(): Seq[String] = Seq("Only `DecimalType` is supported") + override def getSupportLevel(expr: CheckOverflow): SupportLevel = { if (expr.dataType.isInstanceOf[DecimalType]) { Compatible() diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index 
50621fc389..5355edf415 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -68,6 +68,8 @@ object CometUpper extends CometCaseConversionBase[Upper]("upper") object CometLower extends CometCaseConversionBase[Lower]("lower") object CometLength extends CometScalarFunction[Length]("length") { + override def getUnsupportedReasons(): Seq[String] = Seq("`BinaryType` input is not supported") + override def getSupportLevel(expr: Length): SupportLevel = expr.child.dataType match { case _: BinaryType => Unsupported(Some("Length on BinaryType is not supported")) case _ => Compatible() @@ -76,6 +78,11 @@ object CometLength extends CometScalarFunction[Length]("length") { object CometInitCap extends CometScalarFunction[InitCap]("initcap") { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Treats hyphen as a word separator (e.g. `robert rose-smith` produces `Robert Rose-Smith`" + + " instead of Spark's `Robert Rose-smith`)" + + " (https://github.com/apache/datafusion-comet/issues/1052)") + override def getSupportLevel(expr: InitCap): SupportLevel = { // Behavior differs from Spark. One example is that for the input "robert rose-smith", Spark // will produce "Robert Rose-smith", but Comet will produce "Robert Rose-Smith". @@ -136,6 +143,9 @@ object CometLeft extends CometExpressionSerde[Left] { } } + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `BinaryType` and `StringType` input") + override def getSupportLevel(expr: Left): SupportLevel = { expr.str.dataType match { case _: BinaryType | _: StringType => Compatible() @@ -179,6 +189,8 @@ object CometRight extends CometExpressionSerde[Right] { } } + override def getUnsupportedReasons(): Seq[String] = Seq("Only supports `StringType` input") + override def getSupportLevel(expr: Right): SupportLevel = { expr.str.dataType match { case _: StringType => Compatible() @@ -190,6 +202,8 @@ object CometRight extends CometExpressionSerde[Right] { object CometConcat extends CometScalarFunction[Concat]("concat") { val unsupportedReason = "CONCAT supports only string input parameters" + override def getIncompatibleReasons(): Seq[String] = Seq(unsupportedReason) + override def getSupportLevel(expr: Concat): SupportLevel = { if (expr.children.forall(_.dataType == DataTypes.StringType)) { Compatible() @@ -269,6 +283,10 @@ object CometRLike extends CometExpressionSerde[RLike] { object CometStringRPad extends CometExpressionSerde[StringRPad] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Scalar values are not supported for the `str` argument." + + " Only scalar values are supported for the `pad` argument.") + override def getSupportLevel(expr: StringRPad): SupportLevel = { if (expr.str.isInstanceOf[Literal]) { return Unsupported(Some("Scalar values are not supported for the str argument")) @@ -294,6 +312,10 @@ object CometStringRPad extends CometExpressionSerde[StringRPad] { object CometStringLPad extends CometExpressionSerde[StringLPad] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Scalar values are not supported for the `str` argument." 
+ + " Only scalar values are supported for the `pad` argument.") + override def getSupportLevel(expr: StringLPad): SupportLevel = { if (expr.str.isInstanceOf[Literal]) { return Unsupported(Some("Scalar values are not supported for the str argument")) @@ -317,6 +339,12 @@ object CometStringLPad extends CometExpressionSerde[StringLPad] { } object CometRegExpReplace extends CometExpressionSerde[RegExpReplace] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Regexp pattern may not be compatible with Spark") + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `regexp_replace` with an offset of 1 (no offset)") + override def getSupportLevel(expr: RegExpReplace): SupportLevel = { if (!RegExp.isSupportedPattern(expr.regexp.toString) && !CometConf.isExprAllowIncompat("regexp")) { @@ -361,6 +389,9 @@ object CometRegExpReplace extends CometExpressionSerde[RegExpReplace] { */ object CometStringSplit extends CometExpressionSerde[StringSplit] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Regex engine differences between Java and Rust") + override def getSupportLevel(expr: StringSplit): SupportLevel = Incompatible(Some("Regex engine differences between Java and Rust")) @@ -384,6 +415,10 @@ object CometStringSplit extends CometExpressionSerde[StringSplit] { object CometGetJsonObject extends CometExpressionSerde[GetJsonObject] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Spark allows single-quoted JSON and unescaped control characters which Comet does not" + + " support") + override def getSupportLevel(expr: GetJsonObject): SupportLevel = Incompatible( Some( diff --git a/spark/src/main/scala/org/apache/comet/serde/structs.scala b/spark/src/main/scala/org/apache/comet/serde/structs.scala index a9fffcf6f7..4406f68114 100644 --- a/spark/src/main/scala/org/apache/comet/serde/structs.scala +++ b/spark/src/main/scala/org/apache/comet/serde/structs.scala @@ -181,6 +181,9 @@ object CometStructsToJson extends CometExpressionSerde[StructsToJson] { object CometJsonToStructs extends CometExpressionSerde[JsonToStructs] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Partially implemented and not comprehensively tested") + override def getSupportLevel(expr: JsonToStructs): SupportLevel = { // this feature is partially implemented and not comprehensively tested yet Incompatible() @@ -247,6 +250,13 @@ object CometStructsToCsv extends CometExpressionSerde[StructsToCsv] { private val incompatibleDataTypes = Seq(DateType, TimestampType, TimestampNTZType, BinaryType) + override def getIncompatibleReasons(): Seq[String] = Seq( + "Date, Timestamp, TimestampNTZ, and Binary data types may produce different results" + + " (https://github.com/apache/datafusion-comet/issues/3232)") + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Complex types (arrays, maps, structs) in the schema are not supported") + override def getSupportLevel(expr: StructsToCsv): SupportLevel = { val dataTypes = expr.inputSchema.fields.map(_.dataType) val containsComplexType = dataTypes.exists(DataTypeSupport.isComplexType) diff --git a/spark/src/main/scala/org/apache/comet/serde/unixtime.scala b/spark/src/main/scala/org/apache/comet/serde/unixtime.scala index 198c7d3101..e5eeb5b848 100644 --- a/spark/src/main/scala/org/apache/comet/serde/unixtime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/unixtime.scala @@ -29,6 +29,11 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn // 
https://github.com/apache/datafusion/issues/16594 object CometFromUnixTime extends CometExpressionSerde[FromUnixTime] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Only supports the default datetime format pattern `yyyy-MM-dd HH:mm:ss`." + + " DataFusion's valid timestamp range differs from Spark" + + " (https://github.com/apache/datafusion/issues/16594)") + override def getSupportLevel(expr: FromUnixTime): SupportLevel = Incompatible(None) override def convert( From 344f3778f209cd2a97e155bf837dd0f58f768d8d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 24 Apr 2026 09:52:23 -0600 Subject: [PATCH 13/14] docs: address PR feedback - add Support Levels section and use links for references Co-Authored-By: Claude Opus 4.6 --- .../adding_a_new_expression.md | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/source/contributor-guide/adding_a_new_expression.md b/docs/source/contributor-guide/adding_a_new_expression.md index 26a520b40d..eabc2c25d0 100644 --- a/docs/source/contributor-guide/adding_a_new_expression.md +++ b/docs/source/contributor-guide/adding_a_new_expression.md @@ -73,9 +73,9 @@ object CometUnhex extends CometExpressionSerde[Unhex] { The `CometExpressionSerde` trait provides several methods you can override: - `convert(expr: T, inputs: Seq[Attribute], binding: Boolean): Option[Expr]` - **Required**. Converts the Spark expression to protobuf. Return `None` if the expression cannot be converted. -- `getSupportLevel(expr: T): SupportLevel` - Optional. Returns the level of support for the expression at planning time, based on a specific expression instance. See "Using getSupportLevel" section below for details. -- `getIncompatibleReasons(): Seq[String]` - Optional. Returns reasons why this expression may produce different results than Spark. Used to generate the Compatibility Guide. See "Documenting Incompatible and Unsupported Reasons" below. -- `getUnsupportedReasons(): Seq[String]` - Optional. Returns reasons why this expression may not be supported by Comet (for example, unsupported data types or format strings). Used to generate the Compatibility Guide. See "Documenting Incompatible and Unsupported Reasons" below. +- `getSupportLevel(expr: T): SupportLevel` - Optional. Returns the [support level](#support-levels) for the expression at planning time, based on a specific expression instance. See [Using getSupportLevel](#using-getsupportlevel) below for details. +- `getIncompatibleReasons(): Seq[String]` - Optional. Returns reasons why this expression may produce different results than Spark. Used to generate the Compatibility Guide. See [Documenting Incompatible and Unsupported Reasons](#documenting-incompatible-and-unsupported-reasons) below. +- `getUnsupportedReasons(): Seq[String]` - Optional. Returns reasons why this expression may not be supported by Comet (for example, unsupported data types or format strings). Used to generate the Compatibility Guide. See [Documenting Incompatible and Unsupported Reasons](#documenting-incompatible-and-unsupported-reasons) below. - `getExprConfigName(expr: T): String` - Optional. Returns a short name for configuration keys. Defaults to the Spark class name. For simple scalar functions that map directly to a DataFusion function, you can use the built-in `CometScalarFunction` implementation: @@ -103,6 +103,16 @@ A few things to note: - `scalarFunctionExprToProtoWithReturnType` is for scalar functions that need to return type information. 
Your expression may use a different method depending on the type of expression. - Use helper methods like `createBinaryExpr` and `createUnaryExpr` from `QueryPlanSerde` for common expression patterns. +#### Support Levels + +The `SupportLevel` sealed trait has three possible values: + +- **`Compatible(notes: Option[String] = None)`** - Comet supports this expression with full compatibility with Spark, or may have known differences in specific edge cases unlikely to affect most users. This is the default if you don't override `getSupportLevel`. +- **`Incompatible(notes: Option[String] = None)`** - Comet supports this expression but results can differ from Spark. The expression will only be used if `spark.comet.expr.allowIncompatible=true` or the expression-specific config `spark.comet.expr.EXPRNAME.allowIncompatible=true` is set. +- **`Unsupported(notes: Option[String] = None)`** - Comet does not support this expression under the current conditions. Spark will fall back to its native execution. + +All three accept an optional `notes` parameter to provide additional context that is logged for debugging. + #### Using getSupportLevel The `getSupportLevel` method allows you to control whether an expression should be executed by Comet based on various conditions such as data types, parameter values, or other expression-specific constraints. This is particularly useful when: @@ -111,14 +121,6 @@ The `getSupportLevel` method allows you to control whether an expression should 2. Your expression has known incompatibilities with Spark's behavior 3. Your expression has edge cases that aren't yet supported -The method returns one of three `SupportLevel` values: - -- **`Compatible(notes: Option[String] = None)`** - Comet supports this expression with full compatibility with Spark, or may have known differences in specific edge cases that are unlikely to be an issue for most users. This is the default if you don't override `getSupportLevel`. -- **`Incompatible(notes: Option[String] = None)`** - Comet supports this expression but results can be different from Spark. The expression will only be used if `spark.comet.expr.allowIncompatible=true` or the expression-specific config `spark.comet.expr.EXPRNAME.allowIncompatible=true` is set. -- **`Unsupported(notes: Option[String] = None)`** - Comet does not support this expression under the current conditions. The expression will not be used and Spark will fall back to its native execution. - -All three support levels accept an optional `notes` parameter to provide additional context about the support level. - ##### Examples **Example 1: Restricting to specific data types** From 01c95274d99c72d72921185f4f2f4c80a90e4c80 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 24 Apr 2026 11:13:48 -0600 Subject: [PATCH 14/14] docs: move compat details out of expressions.md and add getCompatibleNotes Remove Spark-Compatible? and Compatibility Notes columns from expressions.md; those details now live in the generated Compatibility Guide. Add getCompatibleNotes() to CometExpressionSerde and CometAggregateExpressionSerde for differences that are always present and do not require opting in via allowIncompatible, rendered as a distinct section in the compatibility guide. Backfill reasons in serdes that previously only appeared in expressions.md.
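For illustration, a serde might surface such an always-present difference roughly as follows (a sketch only; CometFoo, the Foo expression, and the note text are hypothetical and not part of this change):

    object CometFoo extends CometExpressionSerde[Foo] {

      // Rendered in the "always present" section of the generated
      // Compatibility Guide; no allowIncompatible opt-in is involved.
      override def getCompatibleNotes(): Seq[String] =
        Seq("Results may depend on the JVM default locale.")

      override def convert(
          expr: Foo,
          inputs: Seq[Attribute],
          binding: Boolean): Option[Expr] = {
        // conversion logic elided in this sketch
        None
      }
    }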
--- docs/source/user-guide/latest/expressions.md | 464 +++++++++--------- .../scala/org/apache/comet/GenerateDocs.scala | 68 ++- .../serde/CometAggregateExpressionSerde.scala | 12 + .../comet/serde/CometExpressionSerde.scala | 12 + .../org/apache/comet/serde/aggregates.scala | 10 + .../scala/org/apache/comet/serde/arrays.scala | 3 + .../org/apache/comet/serde/strings.scala | 18 +- .../org/apache/comet/serde/structs.scala | 2 + 8 files changed, 339 insertions(+), 250 deletions(-) diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index a90070892f..f99f60141c 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -19,204 +19,200 @@ # Supported Spark Expressions -Comet supports the following Spark expressions. Expressions that are marked as Spark-compatible will either run -natively in Comet and provide the same results as Spark, or will fall back to Spark for cases that would not -be compatible. +Comet supports the following Spark expressions. See the [Comet Compatibility Guide] for details on known +incompatibilities and unsupported cases. All expressions are enabled by default, but most can be disabled by setting `spark.comet.expression.EXPRNAME.enabled=false`, where `EXPRNAME` is the expression name as specified in the following tables, such as `Length`, or `StartsWith`. See the [Comet Configuration Guide] for a full list of expressions that be disabled. -Expressions that are not Spark-compatible will fall back to Spark by default and can be enabled by setting -`spark.comet.expression.EXPRNAME.allowIncompatible=true`. - ## Conditional Expressions -| Expression | SQL | Spark-Compatible? | -| ---------- | ------------------------------------------- | ----------------- | -| CaseWhen | `CASE WHEN expr THEN expr ELSE expr END` | Yes | -| If | `IF(predicate_expr, true_expr, false_expr)` | Yes | +| Expression | SQL | +| ---------- | ------------------------------------------- | +| CaseWhen | `CASE WHEN expr THEN expr ELSE expr END` | +| If | `IF(predicate_expr, true_expr, false_expr)` | ## Predicate Expressions -| Expression | SQL | Spark-Compatible? | -| ------------------ | ------------- | ----------------- | -| And | `AND` | Yes | -| EqualTo | `=` | Yes | -| EqualNullSafe | `<=>` | Yes | -| GreaterThan | `>` | Yes | -| GreaterThanOrEqual | `>=` | Yes | -| LessThan | `<` | Yes | -| LessThanOrEqual | `<=` | Yes | -| In | `IN` | Yes | -| IsNotNull | `IS NOT NULL` | Yes | -| IsNull | `IS NULL` | Yes | -| InSet | `IN (...)` | Yes | -| Not | `NOT` | Yes | -| Or | `OR` | Yes | +| Expression | SQL | +| ------------------ | ------------- | +| And | `AND` | +| EqualTo | `=` | +| EqualNullSafe | `<=>` | +| GreaterThan | `>` | +| GreaterThanOrEqual | `>=` | +| LessThan | `<` | +| LessThanOrEqual | `<=` | +| In | `IN` | +| IsNotNull | `IS NOT NULL` | +| IsNull | `IS NULL` | +| InSet | `IN (...)` | +| Not | `NOT` | +| Or | `OR` | ## String Functions -| Expression | Spark-Compatible? | Compatibility Notes | -| --------------- | ----------------- | ---------------------------------------------------------------------------------------------------------- | -| Ascii | Yes | | -| BitLength | Yes | | -| Chr | Yes | | -| Concat | Yes | Only string inputs are supported | -| ConcatWs | Yes | | -| Contains | Yes | | -| EndsWith | Yes | | -| InitCap | No | Behavior is different in some cases, such as hyphenated names. 
| -| Left | Yes | Length argument must be a literal value | -| Length | Yes | | -| Like | Yes | | -| Lower | No | Results can vary depending on locale and character set. Requires `spark.comet.caseConversion.enabled=true` | -| OctetLength | Yes | | -| Reverse | Yes | | -| RLike | No | Uses Rust regexp engine, which has different behavior to Java regexp engine | -| StartsWith | Yes | | -| StringInstr | Yes | | -| StringRepeat | Yes | Negative argument for number of times to repeat causes exception | -| StringReplace | Yes | | -| StringLPad | Yes | | -| StringRPad | Yes | | -| StringSpace | Yes | | -| StringTranslate | Yes | | -| StringTrim | Yes | | -| StringTrimBoth | Yes | | -| StringTrimLeft | Yes | | -| StringTrimRight | Yes | | -| Substring | Yes | | -| Upper | No | Results can vary depending on locale and character set. Requires `spark.comet.caseConversion.enabled=true` | +| Expression | +| --------------- | +| Ascii | +| BitLength | +| Chr | +| Concat | +| ConcatWs | +| Contains | +| EndsWith | +| InitCap | +| Left | +| Length | +| Like | +| Lower | +| OctetLength | +| Reverse | +| RLike | +| StartsWith | +| StringInstr | +| StringRepeat | +| StringReplace | +| StringLPad | +| StringRPad | +| StringSpace | +| StringTranslate | +| StringTrim | +| StringTrimBoth | +| StringTrimLeft | +| StringTrimRight | +| Substring | +| Upper | ## JSON Functions -| Expression | Spark-Compatible? | Compatibility Notes | -| ------------- | ----------------- | --------------------------------------------------------------------------------------------- | -| GetJsonObject | No | Spark allows single-quoted JSON and unescaped control characters which Comet does not support | +| Expression | +| ------------- | +| GetJsonObject | ## Date/Time Functions -| Expression | SQL | Spark-Compatible? | Compatibility Notes | -| -------------- | ---------------------------- | ----------------- | -------------------------------------------------------------------------------------------------------------------------------- | -| DateAdd | `date_add` | Yes | | -| DateDiff | `datediff` | Yes | | -| DateFormat | `date_format` | Yes | Partial support. Only specific format patterns are supported. | -| DateSub | `date_sub` | Yes | | -| DatePart | `date_part(field, source)` | Yes | Supported values of `field`: `year`/`month`/`week`/`day`/`dayofweek`/`dayofweek_iso`/`doy`/`quarter`/`hour`/`minute` | -| Days | `days` | Yes | V2 partition transform. Supports DateType and TimestampType inputs. 
| -| Extract | `extract(field FROM source)` | Yes | Supported values of `field`: `year`/`month`/`week`/`day`/`dayofweek`/`dayofweek_iso`/`doy`/`quarter`/`hour`/`minute` | -| FromUnixTime | `from_unixtime` | No | Does not support format, supports only -8334601211038 <= sec <= 8210266876799 | -| Hour | `hour` | No | Incorrectly applies timezone conversion to TimestampNTZ inputs ([#3180](https://github.com/apache/datafusion-comet/issues/3180)) | -| LastDay | `last_day` | Yes | | -| Minute | `minute` | No | Incorrectly applies timezone conversion to TimestampNTZ inputs ([#3180](https://github.com/apache/datafusion-comet/issues/3180)) | -| Second | `second` | No | Incorrectly applies timezone conversion to TimestampNTZ inputs ([#3180](https://github.com/apache/datafusion-comet/issues/3180)) | -| TruncDate | `trunc` | Yes | | -| TruncTimestamp | `date_trunc` | No | Incorrect results in non-UTC timezones ([#2649](https://github.com/apache/datafusion-comet/issues/2649)) | -| UnixDate | `unix_date` | Yes | | -| UnixTimestamp | `unix_timestamp` | Yes | | -| Year | `year` | Yes | | -| Month | `month` | Yes | | -| DayOfMonth | `day`/`dayofmonth` | Yes | | -| DayOfWeek | `dayofweek` | Yes | | -| WeekDay | `weekday` | Yes | | -| DayOfYear | `dayofyear` | Yes | | -| WeekOfYear | `weekofyear` | Yes | | -| Quarter | `quarter` | Yes | | +| Expression | SQL | +| -------------- | ---------------------------- | +| DateAdd | `date_add` | +| DateDiff | `datediff` | +| DateFormat | `date_format` | +| DateSub | `date_sub` | +| DatePart | `date_part(field, source)` | +| Days | `days` | +| Extract | `extract(field FROM source)` | +| FromUnixTime | `from_unixtime` | +| Hour | `hour` | +| LastDay | `last_day` | +| Minute | `minute` | +| Second | `second` | +| TruncDate | `trunc` | +| TruncTimestamp | `date_trunc` | +| UnixDate | `unix_date` | +| UnixTimestamp | `unix_timestamp` | +| Year | `year` | +| Month | `month` | +| DayOfMonth | `day`/`dayofmonth` | +| DayOfWeek | `dayofweek` | +| WeekDay | `weekday` | +| DayOfYear | `dayofyear` | +| WeekOfYear | `weekofyear` | +| Quarter | `quarter` | ## Math Expressions -| Expression | SQL | Spark-Compatible? 
| Compatibility Notes | -| -------------- | --------- | ----------------- | --------------------------------- | -| Abs | `abs` | Yes | | -| Acos | `acos` | Yes | | -| Add | `+` | Yes | | -| Asin | `asin` | Yes | | -| Atan | `atan` | Yes | | -| Atan2 | `atan2` | Yes | | -| BRound | `bround` | Yes | | -| Ceil | `ceil` | Yes | | -| Cos | `cos` | Yes | | -| Cosh | `cosh` | Yes | | -| Cot | `cot` | Yes | | -| Divide | `/` | Yes | | -| Exp | `exp` | Yes | | -| Expm1 | `expm1` | Yes | | -| Floor | `floor` | Yes | | -| Hex | `hex` | Yes | | -| IntegralDivide | `div` | Yes | | -| IsNaN | `isnan` | Yes | | -| Log | `log` | Yes | | -| Log2 | `log2` | Yes | | -| Log10 | `log10` | Yes | | -| Multiply | `*` | Yes | | -| Pow | `power` | Yes | | -| Rand | `rand` | Yes | | -| Randn | `randn` | Yes | | -| Remainder | `%` | Yes | | -| Round | `round` | Yes | | -| Signum | `signum` | Yes | | -| Sin | `sin` | Yes | | -| Sinh | `sinh` | Yes | | -| Sqrt | `sqrt` | Yes | | -| Subtract | `-` | Yes | | -| Tan | `tan` | Yes | | -| Tanh | `tanh` | Yes | | -| TryAdd | `try_add` | Yes | Only integer inputs are supported | -| TryDivide | `try_div` | Yes | Only integer inputs are supported | -| TryMultiply | `try_mul` | Yes | Only integer inputs are supported | -| TrySubtract | `try_sub` | Yes | Only integer inputs are supported | -| UnaryMinus | `-` | Yes | | -| Unhex | `unhex` | Yes | | +| Expression | SQL | +| -------------- | --------- | +| Abs | `abs` | +| Acos | `acos` | +| Add | `+` | +| Asin | `asin` | +| Atan | `atan` | +| Atan2 | `atan2` | +| BRound | `bround` | +| Ceil | `ceil` | +| Cos | `cos` | +| Cosh | `cosh` | +| Cot | `cot` | +| Divide | `/` | +| Exp | `exp` | +| Expm1 | `expm1` | +| Floor | `floor` | +| Hex | `hex` | +| IntegralDivide | `div` | +| IsNaN | `isnan` | +| Log | `log` | +| Log2 | `log2` | +| Log10 | `log10` | +| Multiply | `*` | +| Pow | `power` | +| Rand | `rand` | +| Randn | `randn` | +| Remainder | `%` | +| Round | `round` | +| Signum | `signum` | +| Sin | `sin` | +| Sinh | `sinh` | +| Sqrt | `sqrt` | +| Subtract | `-` | +| Tan | `tan` | +| Tanh | `tanh` | +| TryAdd | `try_add` | +| TryDivide | `try_div` | +| TryMultiply | `try_mul` | +| TrySubtract | `try_sub` | +| UnaryMinus | `-` | +| Unhex | `unhex` | ## Hashing Functions -| Expression | Spark-Compatible? | -| ----------- | ----------------- | -| Md5 | Yes | -| Murmur3Hash | Yes | -| Sha1 | Yes | -| Sha2 | Yes | -| XxHash64 | Yes | +| Expression | +| ----------- | +| Md5 | +| Murmur3Hash | +| Sha1 | +| Sha2 | +| XxHash64 | ## Bitwise Expressions -| Expression | SQL | Spark-Compatible? | -| ------------ | ---- | ----------------- | -| BitwiseAnd | `&` | Yes | -| BitwiseCount | | Yes | -| BitwiseGet | | Yes | -| BitwiseOr | `\|` | Yes | -| BitwiseNot | `~` | Yes | -| BitwiseXor | `^` | Yes | -| ShiftLeft | `<<` | Yes | -| ShiftRight | `>>` | Yes | +| Expression | SQL | +| ------------ | ---- | +| BitwiseAnd | `&` | +| BitwiseCount | | +| BitwiseGet | | +| BitwiseOr | `\|` | +| BitwiseNot | `~` | +| BitwiseXor | `^` | +| ShiftLeft | `<<` | +| ShiftRight | `>>` | ## Aggregate Expressions -| Expression | SQL | Spark-Compatible? | Compatibility Notes | -| ------------- | ---------- | ------------------------- | ---------------------------------------------------------------- | -| Average | | Yes, except for ANSI mode | | -| BitAndAgg | | Yes | | -| BitOrAgg | | Yes | | -| BitXorAgg | | Yes | | -| BoolAnd | `bool_and` | Yes | | -| BoolOr | `bool_or` | Yes | | -| CollectSet | | No | NaN dedup differs from Spark. See compatibility guide. 
| -| Corr | | Yes | | -| Count | | Yes | | -| CovPopulation | | Yes | | -| CovSample | | Yes | | -| First | | No | This function is not deterministic. Results may not match Spark. | -| Last | | No | This function is not deterministic. Results may not match Spark. | -| Max | | Yes | | -| Min | | Yes | | -| StddevPop | | Yes | | -| StddevSamp | | Yes | | -| Sum | | Yes, except for ANSI mode | | -| VariancePop | | Yes | | -| VarianceSamp | | Yes | | +| Expression | SQL | +| ------------- | ---------- | +| Average | | +| BitAndAgg | | +| BitOrAgg | | +| BitXorAgg | | +| BoolAnd | `bool_and` | +| BoolOr | `bool_or` | +| CollectSet | | +| Corr | | +| Count | | +| CovPopulation | | +| CovSample | | +| First | | +| Last | | +| Max | | +| Min | | +| StddevPop | | +| StddevSamp | | +| Sum | | +| VariancePop | | +| VarianceSamp | | ## Window Functions @@ -226,94 +222,94 @@ Window support is disabled by default due to known correctness issues. Tracking Comet supports using the following aggregate functions within window contexts with PARTITION BY and ORDER BY clauses. -| Expression | Spark-Compatible? | Compatibility Notes | -| ---------- | ----------------- | ------------------- | -| Count | Yes | | -| Max | Yes | | -| Min | Yes | | -| Sum | Yes | | +| Expression | +| ---------- | +| Count | +| Max | +| Min | +| Sum | **Note:** Dedicated window functions such as `rank`, `dense_rank`, `row_number`, `lag`, `lead`, `ntile`, `cume_dist`, `percent_rank`, and `nth_value` are not currently supported and will fall back to Spark. ## Array Expressions -| Expression | Spark-Compatible? | Compatibility Notes | -| -------------- | ----------------- | ----------------------------------------------------- | -| ArrayAppend | Yes | | -| ArrayCompact | No | | -| ArrayContains | Yes | | -| ArrayDistinct | Yes | | -| ArrayExcept | No | | -| ArrayFilter | Yes | Only supports case where function is `IsNotNull` | -| ArrayInsert | No | | -| ArrayIntersect | No | | -| ArrayJoin | No | | -| ArrayMax | Yes | | -| ArrayMin | Yes | | -| ArrayRemove | Yes | | -| ArrayRepeat | No | | -| ArrayUnion | Yes | | -| ArraysOverlap | Yes | | -| CreateArray | Yes | | -| ElementAt | Yes | Input must be an array. Map inputs are not supported. | -| Flatten | Yes | | -| GetArrayItem | Yes | | +| Expression | +| -------------- | +| ArrayAppend | +| ArrayCompact | +| ArrayContains | +| ArrayDistinct | +| ArrayExcept | +| ArrayFilter | +| ArrayInsert | +| ArrayIntersect | +| ArrayJoin | +| ArrayMax | +| ArrayMin | +| ArrayRemove | +| ArrayRepeat | +| ArrayUnion | +| ArraysOverlap | +| CreateArray | +| ElementAt | +| Flatten | +| GetArrayItem | ## Map Expressions -| Expression | Spark-Compatible? | -| ------------- | ----------------- | -| GetMapValue | Yes | -| MapKeys | Yes | -| MapEntries | Yes | -| MapValues | Yes | -| MapFromArrays | Yes | +| Expression | +| ------------- | +| GetMapValue | +| MapKeys | +| MapEntries | +| MapValues | +| MapFromArrays | ## Struct Expressions -| Expression | Spark-Compatible? | Compatibility Notes | -| -------------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------- | -| CreateNamedStruct | Yes | | -| GetArrayStructFields | Yes | | -| GetStructField | Yes | | -| JsonToStructs | No | Partial support. Requires explicit schema. 
| -| StructsToJson | No | Does not support Infinity/-Infinity for numeric types ([#3016](https://github.com/apache/datafusion-comet/issues/3016)) | +| Expression | +| -------------------- | +| CreateNamedStruct | +| GetArrayStructFields | +| GetStructField | +| JsonToStructs | +| StructsToJson | ## Conversion Expressions -| Expression | Spark-Compatible | Compatibility Notes | -| ---------- | ------------------------ | ------------------------------------------------------------------------------------------- | -| Cast | Depends on specific cast | See the [Comet Compatibility Guide] for list of supported cast expressions and known issues | +| Expression | +| ---------- | +| Cast | ## SortOrder -| Expression | Spark-Compatible? | Compatibility Notes | -| ---------- | ----------------- | ------------------- | -| NullsFirst | Yes | | -| NullsLast | Yes | | -| Ascending | Yes | | -| Descending | Yes | | +| Expression | +| ---------- | +| NullsFirst | +| NullsLast | +| Ascending | +| Descending | ## Other -| Expression | Spark-Compatible? | Compatibility Notes | -| ---------------------------- | ----------------- | --------------------------------------------------------------------------- | -| Alias | Yes | | -| AttributeReference | Yes | | -| BloomFilterMightContain | Yes | | -| Coalesce | Yes | | -| CheckOverflow | Yes | | -| KnownFloatingPointNormalized | Yes | | -| Literal | Yes | | -| MakeDecimal | Yes | | -| MonotonicallyIncreasingID | Yes | | -| NormalizeNaNAndZero | Yes | | -| PromotePrecision | Yes | | -| RegExpReplace | No | Uses Rust regexp engine, which has different behavior to Java regexp engine | -| ScalarSubquery | Yes | | -| SparkPartitionID | Yes | | -| ToPrettyString | Yes | | -| UnscaledValue | Yes | | +| Expression | +| ---------------------------- | +| Alias | +| AttributeReference | +| BloomFilterMightContain | +| Coalesce | +| CheckOverflow | +| KnownFloatingPointNormalized | +| Literal | +| MakeDecimal | +| MonotonicallyIncreasingID | +| NormalizeNaNAndZero | +| PromotePrecision | +| RegExpReplace | +| ScalarSubquery | +| SparkPartitionID | +| ToPrettyString | +| UnscaledValue | [Comet Configuration Guide]: configs.md [Comet Compatibility Guide]: compatibility/expressions/index.md diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index 2360508c6c..08f0f86917 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -39,8 +39,10 @@ object GenerateDocs { private val publicConfigs: Set[ConfigEntry[_]] = CometConf.allConfs.filter(_.isPublic).toSet - /** (expression class simple name, incompatible reasons, unsupported reasons) */ - private type CategoryNotes = Seq[(String, Seq[String], Seq[String])] + /** + * (expression class simple name, compatible notes, incompatible reasons, unsupported reasons) + */ + private type CategoryNotes = Seq[(String, Seq[String], Seq[String], Seq[String])] /** * Mapping from expression category to the compatibility guide page where that category's @@ -51,42 +53,74 @@ object GenerateDocs { "array" -> ("compatibility/expressions/array.md", () => QueryPlanSerde.arrayExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "datetime" -> ("compatibility/expressions/datetime.md", () => 
QueryPlanSerde.temporalExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "math" -> ("compatibility/expressions/math.md", () => QueryPlanSerde.mathExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "struct" -> ("compatibility/expressions/struct.md", () => QueryPlanSerde.structExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "aggregate" -> ("compatibility/expressions/aggregate.md", () => QueryPlanSerde.aggrSerdeMap.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "string" -> ("compatibility/expressions/string.md", () => QueryPlanSerde.stringExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "map" -> ("compatibility/expressions/map.md", () => QueryPlanSerde.mapExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "misc" -> ("compatibility/expressions/misc.md", () => QueryPlanSerde.miscExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) })) def main(args: Array[String]): Unit = { @@ -191,11 +225,19 @@ object GenerateDocs { } private def writeExpressionCompatNotes(w: BufferedOutputStream, notes: CategoryNotes): Unit = { - val sorted = notes.sortBy(_._1).filter { case (_, incompat, unsupported) => - incompat.nonEmpty || unsupported.nonEmpty + val sorted = notes.sortBy(_._1).filter { case (_, compat, incompat, unsupported) => + compat.nonEmpty || incompat.nonEmpty || unsupported.nonEmpty } - for ((name, incompat, unsupported) <- sorted) { + for ((name, compat, incompat, unsupported) <- sorted) { w.write(s"\n### $name\n".getBytes) + if (compat.nonEmpty) { + w.write( + ("\nThe following differences from Spark are always present and do not require" + + " any additional configuration:\n\n").getBytes) + for (note <- compat) { + w.write(s"- $note\n".getBytes) + } + } if (incompat.nonEmpty) { w.write( (s"\nThe following incompatibilities cause `$name` to fall back to Spark by default." 
+ diff --git a/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala index ba220dcce8..316510400b 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala @@ -39,6 +39,18 @@ trait CometAggregateExpressionSerde[T <: AggregateFunction] { */ def getExprConfigName(expr: T): String = expr.getClass.getSimpleName + /** + * Get documentation notes about ways this expression may differ from Spark that do not require + * the user to opt in via `spark.comet.expr.allowIncompatible`. Use this for differences that + * are always present, such as non-determinism or locale-specific behavior. This is called from + * GenerateDocs when generating the Compatibility Guide. Each note should be written in Markdown + * and may span multiple lines. + * + * @return + * List of notes, defaulting to an empty list. + */ + def getCompatibleNotes(): Seq[String] = Seq.empty + /** * Get documentation for usages where this expression may be incompatible with Spark. This is * called from GenerateDocs when generating the Compatibility Guide. Each reason should be diff --git a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala index afad6f6bb2..e015de879a 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala @@ -37,6 +37,18 @@ trait CometExpressionSerde[T <: Expression] { */ def getExprConfigName(expr: T): String = expr.getClass.getSimpleName + /** + * Get documentation notes about ways this expression may differ from Spark that do not require + * the user to opt in via `spark.comet.expr.allowIncompatible`. Use this for differences that + * are always present, such as non-determinism or locale-specific behavior. This is called from + * GenerateDocs when generating the Compatibility Guide. Each note should be written in Markdown + * and may span multiple lines. + * + * @return + * List of notes, defaulting to an empty list. + */ + def getCompatibleNotes(): Seq[String] = Seq.empty + /** * Get documentation for usages where this expression may be incompatible with Spark. This is * called from GenerateDocs when generating the Compatibility Guide. Each reason should be diff --git a/spark/src/main/scala/org/apache/comet/serde/aggregates.scala b/spark/src/main/scala/org/apache/comet/serde/aggregates.scala index 549776e7c3..6889fc02d9 100644 --- a/spark/src/main/scala/org/apache/comet/serde/aggregates.scala +++ b/spark/src/main/scala/org/apache/comet/serde/aggregates.scala @@ -205,6 +205,8 @@ object CometAverage extends CometAggregateExpressionSerde[Average] { object CometSum extends CometAggregateExpressionSerde[Sum] { + override def getIncompatibleReasons(): Seq[String] = Seq("Falls back to Spark in ANSI mode.") + override def convert( aggExpr: AggregateExpression, sum: Sum, @@ -245,6 +247,10 @@ object CometSum extends CometAggregateExpressionSerde[Sum] { } object CometFirst extends CometAggregateExpressionSerde[First] { + + override def getCompatibleNotes(): Seq[String] = Seq( + "This function is not deterministic. 
Results may not match Spark.") + override def convert( aggExpr: AggregateExpression, first: First, @@ -277,6 +283,10 @@ object CometFirst extends CometAggregateExpressionSerde[First] { } object CometLast extends CometAggregateExpressionSerde[Last] { + + override def getCompatibleNotes(): Seq[String] = Seq( + "This function is not deterministic. Results may not match Spark.") + override def convert( aggExpr: AggregateExpression, last: Last, diff --git a/spark/src/main/scala/org/apache/comet/serde/arrays.scala b/spark/src/main/scala/org/apache/comet/serde/arrays.scala index cd037e9fd9..ed4ee4265a 100644 --- a/spark/src/main/scala/org/apache/comet/serde/arrays.scala +++ b/spark/src/main/scala/org/apache/comet/serde/arrays.scala @@ -546,6 +546,9 @@ object CometArrayReverse extends CometExpressionSerde[Reverse] with ArraysBase { object CometElementAt extends CometExpressionSerde[ElementAt] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Input must be an array. `Map` inputs are not supported.") + override def convert( expr: ElementAt, inputs: Seq[Attribute], diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index 5355edf415..968fe8cd69 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -33,6 +33,10 @@ import org.apache.comet.serde.QueryPlanSerde.{createBinaryExpr, exprToProtoInter object CometStringRepeat extends CometExpressionSerde[StringRepeat] { + override def getCompatibleNotes(): Seq[String] = Seq( + "A negative argument for the number of times to repeat throws an exception" + + " instead of returning an empty string as Spark does") + override def convert( expr: StringRepeat, inputs: Seq[Attribute], @@ -50,6 +54,10 @@ object CometStringRepeat extends CometExpressionSerde[StringRepeat] { class CometCaseConversionBase[T <: Expression](function: String) extends CometScalarFunction[T](function) { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Results can vary depending on locale and character set." 
+ + s" Requires `${CometConf.COMET_CASE_CONVERSION_ENABLED.key}=true` to enable.") + override def convert(expr: T, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { if (!CometConf.COMET_CASE_CONVERSION_ENABLED.get()) { withInfo( @@ -123,6 +131,10 @@ object CometSubstring extends CometExpressionSerde[Substring] { object CometLeft extends CometExpressionSerde[Left] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `BinaryType` and `StringType` input", + "The length argument must be a literal value") + override def convert(expr: Left, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { expr.len match { case Literal(lenValue, _) => @@ -143,9 +155,6 @@ object CometLeft extends CometExpressionSerde[Left] { } } - override def getUnsupportedReasons(): Seq[String] = Seq( - "Only supports `BinaryType` and `StringType` input") - override def getSupportLevel(expr: Left): SupportLevel = { expr.str.dataType match { case _: BinaryType | _: StringType => Compatible() @@ -254,6 +263,9 @@ object CometLike extends CometExpressionSerde[Like] { object CometRLike extends CometExpressionSerde[RLike] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Uses Rust regexp engine, which has different behavior to Java regexp engine") + override def convert(expr: RLike, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { expr.right match { case Literal(pattern, DataTypes.StringType) => diff --git a/spark/src/main/scala/org/apache/comet/serde/structs.scala b/spark/src/main/scala/org/apache/comet/serde/structs.scala index 4406f68114..688d27f91f 100644 --- a/spark/src/main/scala/org/apache/comet/serde/structs.scala +++ b/spark/src/main/scala/org/apache/comet/serde/structs.scala @@ -184,6 +184,8 @@ object CometJsonToStructs extends CometExpressionSerde[JsonToStructs] { override def getIncompatibleReasons(): Seq[String] = Seq( "Partially implemented and not comprehensively tested") + override def getUnsupportedReasons(): Seq[String] = Seq("Requires an explicit schema") + override def getSupportLevel(expr: JsonToStructs): SupportLevel = { // this feature is partially implemented and not comprehensively tested yet Incompatible()