diff --git a/docs/source/contributor-guide/adding_a_new_expression.md b/docs/source/contributor-guide/adding_a_new_expression.md index c4ce4b377d..57131d0091 100644 --- a/docs/source/contributor-guide/adding_a_new_expression.md +++ b/docs/source/contributor-guide/adding_a_new_expression.md @@ -73,9 +73,9 @@ object CometUnhex extends CometExpressionSerde[Unhex] { The `CometExpressionSerde` trait provides several methods you can override: - `convert(expr: T, inputs: Seq[Attribute], binding: Boolean): Option[Expr]` - **Required**. Converts the Spark expression to protobuf. Return `None` if the expression cannot be converted. -- `getSupportLevel(expr: T): SupportLevel` - Optional. Returns the level of support for the expression at planning time, based on a specific expression instance. See "Using getSupportLevel" section below for details. -- `getIncompatibleReasons(): Seq[String]` - Optional. Returns reasons why this expression may produce different results than Spark. Used to generate the Compatibility Guide. See "Documenting Incompatible and Unsupported Reasons" below. -- `getUnsupportedReasons(): Seq[String]` - Optional. Returns reasons why this expression may not be supported by Comet (for example, unsupported data types or format strings). Used to generate the Compatibility Guide. See "Documenting Incompatible and Unsupported Reasons" below. +- `getSupportLevel(expr: T): SupportLevel` - Optional. Returns the [support level](#support-levels) for the expression at planning time, based on a specific expression instance. See [Using getSupportLevel](#using-getsupportlevel) below for details. +- `getIncompatibleReasons(): Seq[String]` - Optional. Returns reasons why this expression may produce different results than Spark. Used to generate the Compatibility Guide. See [Documenting Incompatible and Unsupported Reasons](#documenting-incompatible-and-unsupported-reasons) below. +- `getUnsupportedReasons(): Seq[String]` - Optional. Returns reasons why this expression may not be supported by Comet (for example, unsupported data types or format strings). Used to generate the Compatibility Guide. See [Documenting Incompatible and Unsupported Reasons](#documenting-incompatible-and-unsupported-reasons) below. - `getExprConfigName(expr: T): String` - Optional. Returns a short name for configuration keys. Defaults to the Spark class name. For simple scalar functions that map directly to a DataFusion function, you can use the built-in `CometScalarFunction` implementation: @@ -103,6 +103,16 @@ A few things to note: - `scalarFunctionExprToProtoWithReturnType` is for scalar functions that need to return type information. Your expression may use a different method depending on the type of expression. - Use helper methods like `createBinaryExpr` and `createUnaryExpr` from `QueryPlanSerde` for common expression patterns. +#### Support Levels + +The `SupportLevel` sealed trait has three possible values: + +- **`Compatible(notes: Option[String] = None)`** - Comet supports this expression with full compatibility with Spark, or may have known differences in specific edge cases unlikely to affect most users. This is the default if you don't override `getSupportLevel`. +- **`Incompatible(notes: Option[String] = None)`** - Comet supports this expression but results can differ from Spark. The expression will only be used if `spark.comet.expr.allowIncompatible=true` or the expression-specific config `spark.comet.expr.<exprName>.allowIncompatible=true` is set.
+- **`Unsupported(notes: Option[String] = None)`** - Comet does not support this expression under the current conditions. Spark will fall back to its native execution. + +All three accept an optional `notes` parameter to provide additional context that is logged for debugging. + #### Using getSupportLevel The `getSupportLevel` method allows you to control whether an expression should be executed by Comet based on various conditions such as data types, parameter values, or other expression-specific constraints. This is particularly useful when: @@ -111,14 +121,6 @@ The `getSupportLevel` method allows you to control whether an expression should 2. Your expression has known incompatibilities with Spark's behavior 3. Your expression has edge cases that aren't yet supported -The method returns one of three `SupportLevel` values: - -- **`Compatible(notes: Option[String] = None)`** - Comet supports this expression with full compatibility with Spark, or may have known differences in specific edge cases that are unlikely to be an issue for most users. This is the default if you don't override `getSupportLevel`. -- **`Incompatible(notes: Option[String] = None)`** - Comet supports this expression but results can be different from Spark. The expression will only be used if `spark.comet.expr.allowIncompatible=true` or the expression-specific config `spark.comet.expr..allowIncompatible=true` is set. -- **`Unsupported(notes: Option[String] = None)`** - Comet does not support this expression under the current conditions. The expression will not be used and Spark will fall back to its native execution. - -All three support levels accept an optional `notes` parameter to provide additional context about the support level. - ##### Examples **Example 1: Restricting to specific data types** diff --git a/docs/source/user-guide/latest/compatibility/expressions/index.md b/docs/source/user-guide/latest/compatibility/expressions/index.md index 2c4742f36d..9fbec44f0b 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/index.md @@ -31,7 +31,10 @@ Compatibility notes are grouped by expression category: aggregate array datetime +map math +misc +string struct cast ``` diff --git a/docs/source/user-guide/latest/compatibility/expressions/map.md b/docs/source/user-guide/latest/compatibility/expressions/map.md new file mode 100644 index 0000000000..c8e2fdba2a --- /dev/null +++ b/docs/source/user-guide/latest/compatibility/expressions/map.md @@ -0,0 +1,23 @@ + + +# Map Expressions + + + diff --git a/docs/source/user-guide/latest/compatibility/expressions/misc.md b/docs/source/user-guide/latest/compatibility/expressions/misc.md new file mode 100644 index 0000000000..58d9a0e122 --- /dev/null +++ b/docs/source/user-guide/latest/compatibility/expressions/misc.md @@ -0,0 +1,23 @@ + + +# Miscellaneous Expressions + + + diff --git a/docs/source/user-guide/latest/compatibility/expressions/string.md b/docs/source/user-guide/latest/compatibility/expressions/string.md new file mode 100644 index 0000000000..d86f299c03 --- /dev/null +++ b/docs/source/user-guide/latest/compatibility/expressions/string.md @@ -0,0 +1,23 @@ + + +# String Expressions + + + diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index 40610d05ac..f99f60141c 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -19,204 +19,200 @@ # Supported Spark Expressions -Comet 
supports the following Spark expressions. Expressions that are marked as Spark-compatible will either run -natively in Comet and provide the same results as Spark, or will fall back to Spark for cases that would not -be compatible. +Comet supports the following Spark expressions. See the [Comet Compatibility Guide] for details on known +incompatibilities and unsupported cases. All expressions are enabled by default, but most can be disabled by setting `spark.comet.expression.EXPRNAME.enabled=false`, where `EXPRNAME` is the expression name as specified in the following tables, such as `Length`, or `StartsWith`. See the [Comet Configuration Guide] for a full list of expressions that be disabled. -Expressions that are not Spark-compatible will fall back to Spark by default and can be enabled by setting -`spark.comet.expression.EXPRNAME.allowIncompatible=true`. - ## Conditional Expressions -| Expression | SQL | Spark-Compatible? | -| ---------- | ------------------------------------------- | ----------------- | -| CaseWhen | `CASE WHEN expr THEN expr ELSE expr END` | Yes | -| If | `IF(predicate_expr, true_expr, false_expr)` | Yes | +| Expression | SQL | +| ---------- | ------------------------------------------- | +| CaseWhen | `CASE WHEN expr THEN expr ELSE expr END` | +| If | `IF(predicate_expr, true_expr, false_expr)` | ## Predicate Expressions -| Expression | SQL | Spark-Compatible? | -| ------------------ | ------------- | ----------------- | -| And | `AND` | Yes | -| EqualTo | `=` | Yes | -| EqualNullSafe | `<=>` | Yes | -| GreaterThan | `>` | Yes | -| GreaterThanOrEqual | `>=` | Yes | -| LessThan | `<` | Yes | -| LessThanOrEqual | `<=` | Yes | -| In | `IN` | Yes | -| IsNotNull | `IS NOT NULL` | Yes | -| IsNull | `IS NULL` | Yes | -| InSet | `IN (...)` | Yes | -| Not | `NOT` | Yes | -| Or | `OR` | Yes | +| Expression | SQL | +| ------------------ | ------------- | +| And | `AND` | +| EqualTo | `=` | +| EqualNullSafe | `<=>` | +| GreaterThan | `>` | +| GreaterThanOrEqual | `>=` | +| LessThan | `<` | +| LessThanOrEqual | `<=` | +| In | `IN` | +| IsNotNull | `IS NOT NULL` | +| IsNull | `IS NULL` | +| InSet | `IN (...)` | +| Not | `NOT` | +| Or | `OR` | ## String Functions -| Expression | Spark-Compatible? | Compatibility Notes | -| --------------- | ----------------- | ---------------------------------------------------------------------------------------------------------- | -| Ascii | Yes | | -| BitLength | Yes | | -| Chr | Yes | | -| Concat | Yes | Only string inputs are supported | -| ConcatWs | Yes | | -| Contains | Yes | | -| EndsWith | Yes | | -| InitCap | No | Behavior is different in some cases, such as hyphenated names. | -| Left | Yes | Length argument must be a literal value | -| Length | Yes | | -| Like | Yes | | -| Lower | No | Results can vary depending on locale and character set. Requires `spark.comet.caseConversion.enabled=true` | -| OctetLength | Yes | | -| Reverse | Yes | | -| RLike | No | Uses Rust regexp engine, which has different behavior to Java regexp engine | -| StartsWith | Yes | | -| StringInstr | Yes | | -| StringRepeat | Yes | Negative argument for number of times to repeat causes exception | -| StringReplace | Yes | | -| StringLPad | Yes | | -| StringRPad | Yes | | -| StringSpace | Yes | | -| StringTranslate | Yes | | -| StringTrim | Yes | | -| StringTrimBoth | Yes | | -| StringTrimLeft | Yes | | -| StringTrimRight | Yes | | -| Substring | Yes | | -| Upper | No | Results can vary depending on locale and character set. 
Requires `spark.comet.caseConversion.enabled=true` | +| Expression | +| --------------- | +| Ascii | +| BitLength | +| Chr | +| Concat | +| ConcatWs | +| Contains | +| EndsWith | +| InitCap | +| Left | +| Length | +| Like | +| Lower | +| OctetLength | +| Reverse | +| RLike | +| StartsWith | +| StringInstr | +| StringRepeat | +| StringReplace | +| StringLPad | +| StringRPad | +| StringSpace | +| StringTranslate | +| StringTrim | +| StringTrimBoth | +| StringTrimLeft | +| StringTrimRight | +| Substring | +| Upper | ## JSON Functions -| Expression | Spark-Compatible? | Compatibility Notes | -| ------------- | ----------------- | --------------------------------------------------------------------------------------------- | -| GetJsonObject | No | Spark allows single-quoted JSON and unescaped control characters which Comet does not support | +| Expression | +| ------------- | +| GetJsonObject | ## Date/Time Functions -| Expression | SQL | Spark-Compatible? | Compatibility Notes | -| -------------- | ---------------------------- | ----------------- | -------------------------------------------------------------------------------------------------------------------------------- | -| DateAdd | `date_add` | Yes | | -| DateDiff | `datediff` | Yes | | -| DateFormat | `date_format` | Yes | Partial support. Only specific format patterns are supported. | -| DateSub | `date_sub` | Yes | | -| DatePart | `date_part(field, source)` | Yes | Supported values of `field`: `year`/`month`/`week`/`day`/`dayofweek`/`dayofweek_iso`/`doy`/`quarter`/`hour`/`minute` | -| Days | `days` | Yes | V2 partition transform. Supports DateType and TimestampType inputs. | -| Extract | `extract(field FROM source)` | Yes | Supported values of `field`: `year`/`month`/`week`/`day`/`dayofweek`/`dayofweek_iso`/`doy`/`quarter`/`hour`/`minute` | -| FromUnixTime | `from_unixtime` | No | Does not support format, supports only -8334601211038 <= sec <= 8210266876799 | -| Hour | `hour` | No | Incorrectly applies timezone conversion to TimestampNTZ inputs ([#3180](https://github.com/apache/datafusion-comet/issues/3180)) | -| LastDay | `last_day` | Yes | | -| Minute | `minute` | No | Incorrectly applies timezone conversion to TimestampNTZ inputs ([#3180](https://github.com/apache/datafusion-comet/issues/3180)) | -| Second | `second` | No | Incorrectly applies timezone conversion to TimestampNTZ inputs ([#3180](https://github.com/apache/datafusion-comet/issues/3180)) | -| TruncDate | `trunc` | Yes | | -| TruncTimestamp | `date_trunc` | No | Incorrect results in non-UTC timezones ([#2649](https://github.com/apache/datafusion-comet/issues/2649)) | -| UnixDate | `unix_date` | Yes | | -| UnixTimestamp | `unix_timestamp` | Yes | | -| Year | `year` | Yes | | -| Month | `month` | Yes | | -| DayOfMonth | `day`/`dayofmonth` | Yes | | -| DayOfWeek | `dayofweek` | Yes | | -| WeekDay | `weekday` | Yes | | -| DayOfYear | `dayofyear` | Yes | | -| WeekOfYear | `weekofyear` | Yes | | -| Quarter | `quarter` | Yes | | +| Expression | SQL | +| -------------- | ---------------------------- | +| DateAdd | `date_add` | +| DateDiff | `datediff` | +| DateFormat | `date_format` | +| DateSub | `date_sub` | +| DatePart | `date_part(field, source)` | +| Days | `days` | +| Extract | `extract(field FROM source)` | +| FromUnixTime | `from_unixtime` | +| Hour | `hour` | +| LastDay | `last_day` | +| Minute | `minute` | +| Second | `second` | +| TruncDate | `trunc` | +| TruncTimestamp | `date_trunc` | +| UnixDate | `unix_date` | +| UnixTimestamp | `unix_timestamp` | +| Year | 
`year` | +| Month | `month` | +| DayOfMonth | `day`/`dayofmonth` | +| DayOfWeek | `dayofweek` | +| WeekDay | `weekday` | +| DayOfYear | `dayofyear` | +| WeekOfYear | `weekofyear` | +| Quarter | `quarter` | ## Math Expressions -| Expression | SQL | Spark-Compatible? | Compatibility Notes | -| -------------- | --------- | ----------------- | --------------------------------- | -| Abs | `abs` | Yes | | -| Acos | `acos` | Yes | | -| Add | `+` | Yes | | -| Asin | `asin` | Yes | | -| Atan | `atan` | Yes | | -| Atan2 | `atan2` | Yes | | -| BRound | `bround` | Yes | | -| Ceil | `ceil` | Yes | | -| Cos | `cos` | Yes | | -| Cosh | `cosh` | Yes | | -| Cot | `cot` | Yes | | -| Divide | `/` | Yes | | -| Exp | `exp` | Yes | | -| Expm1 | `expm1` | Yes | | -| Floor | `floor` | Yes | | -| Hex | `hex` | Yes | | -| IntegralDivide | `div` | Yes | | -| IsNaN | `isnan` | Yes | | -| Log | `log` | Yes | | -| Log2 | `log2` | Yes | | -| Log10 | `log10` | Yes | | -| Multiply | `*` | Yes | | -| Pow | `power` | Yes | | -| Rand | `rand` | Yes | | -| Randn | `randn` | Yes | | -| Remainder | `%` | Yes | | -| Round | `round` | Yes | | -| Signum | `signum` | Yes | | -| Sin | `sin` | Yes | | -| Sinh | `sinh` | Yes | | -| Sqrt | `sqrt` | Yes | | -| Subtract | `-` | Yes | | -| Tan | `tan` | Yes | | -| Tanh | `tanh` | Yes | | -| TryAdd | `try_add` | Yes | Only integer inputs are supported | -| TryDivide | `try_div` | Yes | Only integer inputs are supported | -| TryMultiply | `try_mul` | Yes | Only integer inputs are supported | -| TrySubtract | `try_sub` | Yes | Only integer inputs are supported | -| UnaryMinus | `-` | Yes | | -| Unhex | `unhex` | Yes | | +| Expression | SQL | +| -------------- | --------- | +| Abs | `abs` | +| Acos | `acos` | +| Add | `+` | +| Asin | `asin` | +| Atan | `atan` | +| Atan2 | `atan2` | +| BRound | `bround` | +| Ceil | `ceil` | +| Cos | `cos` | +| Cosh | `cosh` | +| Cot | `cot` | +| Divide | `/` | +| Exp | `exp` | +| Expm1 | `expm1` | +| Floor | `floor` | +| Hex | `hex` | +| IntegralDivide | `div` | +| IsNaN | `isnan` | +| Log | `log` | +| Log2 | `log2` | +| Log10 | `log10` | +| Multiply | `*` | +| Pow | `power` | +| Rand | `rand` | +| Randn | `randn` | +| Remainder | `%` | +| Round | `round` | +| Signum | `signum` | +| Sin | `sin` | +| Sinh | `sinh` | +| Sqrt | `sqrt` | +| Subtract | `-` | +| Tan | `tan` | +| Tanh | `tanh` | +| TryAdd | `try_add` | +| TryDivide | `try_div` | +| TryMultiply | `try_mul` | +| TrySubtract | `try_sub` | +| UnaryMinus | `-` | +| Unhex | `unhex` | ## Hashing Functions -| Expression | Spark-Compatible? | -| ----------- | ----------------- | -| Md5 | Yes | -| Murmur3Hash | Yes | -| Sha1 | Yes | -| Sha2 | Yes | -| XxHash64 | Yes | +| Expression | +| ----------- | +| Md5 | +| Murmur3Hash | +| Sha1 | +| Sha2 | +| XxHash64 | ## Bitwise Expressions -| Expression | SQL | Spark-Compatible? | -| ------------ | ---- | ----------------- | -| BitwiseAnd | `&` | Yes | -| BitwiseCount | | Yes | -| BitwiseGet | | Yes | -| BitwiseOr | `\|` | Yes | -| BitwiseNot | `~` | Yes | -| BitwiseXor | `^` | Yes | -| ShiftLeft | `<<` | Yes | -| ShiftRight | `>>` | Yes | +| Expression | SQL | +| ------------ | ---- | +| BitwiseAnd | `&` | +| BitwiseCount | | +| BitwiseGet | | +| BitwiseOr | `\|` | +| BitwiseNot | `~` | +| BitwiseXor | `^` | +| ShiftLeft | `<<` | +| ShiftRight | `>>` | ## Aggregate Expressions -| Expression | SQL | Spark-Compatible? 
| Compatibility Notes | -| ------------- | ---------- | ------------------------- | ---------------------------------------------------------------- | -| Average | | Yes, except for ANSI mode | | -| BitAndAgg | | Yes | | -| BitOrAgg | | Yes | | -| BitXorAgg | | Yes | | -| BoolAnd | `bool_and` | Yes | | -| BoolOr | `bool_or` | Yes | | -| CollectSet | | No | NaN dedup differs from Spark. See compatibility guide. | -| Corr | | Yes | | -| Count | | Yes | | -| CovPopulation | | Yes | | -| CovSample | | Yes | | -| First | | No | This function is not deterministic. Results may not match Spark. | -| Last | | No | This function is not deterministic. Results may not match Spark. | -| Max | | Yes | | -| Min | | Yes | | -| StddevPop | | Yes | | -| StddevSamp | | Yes | | -| Sum | | Yes, except for ANSI mode | | -| VariancePop | | Yes | | -| VarianceSamp | | Yes | | +| Expression | SQL | +| ------------- | ---------- | +| Average | | +| BitAndAgg | | +| BitOrAgg | | +| BitXorAgg | | +| BoolAnd | `bool_and` | +| BoolOr | `bool_or` | +| CollectSet | | +| Corr | | +| Count | | +| CovPopulation | | +| CovSample | | +| First | | +| Last | | +| Max | | +| Min | | +| StddevPop | | +| StddevSamp | | +| Sum | | +| VariancePop | | +| VarianceSamp | | ## Window Functions @@ -226,94 +222,94 @@ Window support is disabled by default due to known correctness issues. Tracking Comet supports using the following aggregate functions within window contexts with PARTITION BY and ORDER BY clauses. -| Expression | Spark-Compatible? | Compatibility Notes | -| ---------- | ----------------- | ------------------- | -| Count | Yes | | -| Max | Yes | | -| Min | Yes | | -| Sum | Yes | | +| Expression | +| ---------- | +| Count | +| Max | +| Min | +| Sum | **Note:** Dedicated window functions such as `rank`, `dense_rank`, `row_number`, `lag`, `lead`, `ntile`, `cume_dist`, `percent_rank`, and `nth_value` are not currently supported and will fall back to Spark. ## Array Expressions -| Expression | Spark-Compatible? | Compatibility Notes | -| -------------- | ----------------- | ----------------------------------------------------- | -| ArrayAppend | Yes | | -| ArrayCompact | No | | -| ArrayContains | Yes | | -| ArrayDistinct | Yes | | -| ArrayExcept | No | | -| ArrayFilter | Yes | Only supports case where function is `IsNotNull` | -| ArrayInsert | Yes | | -| ArrayIntersect | No | | -| ArrayJoin | No | | -| ArrayMax | Yes | | -| ArrayMin | Yes | | -| ArrayRemove | Yes | | -| ArrayRepeat | No | | -| ArrayUnion | Yes | | -| ArraysOverlap | Yes | | -| CreateArray | Yes | | -| ElementAt | Yes | Input must be an array. Map inputs are not supported. | -| Flatten | Yes | | -| GetArrayItem | Yes | | +| Expression | +| -------------- | +| ArrayAppend | +| ArrayCompact | +| ArrayContains | +| ArrayDistinct | +| ArrayExcept | +| ArrayFilter | +| ArrayInsert | +| ArrayIntersect | +| ArrayJoin | +| ArrayMax | +| ArrayMin | +| ArrayRemove | +| ArrayRepeat | +| ArrayUnion | +| ArraysOverlap | +| CreateArray | +| ElementAt | +| Flatten | +| GetArrayItem | ## Map Expressions -| Expression | Spark-Compatible? | -| ------------- | ----------------- | -| GetMapValue | Yes | -| MapKeys | Yes | -| MapEntries | Yes | -| MapValues | Yes | -| MapFromArrays | Yes | +| Expression | +| ------------- | +| GetMapValue | +| MapKeys | +| MapEntries | +| MapValues | +| MapFromArrays | ## Struct Expressions -| Expression | Spark-Compatible? 
| Compatibility Notes | -| -------------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------- | -| CreateNamedStruct | Yes | | -| GetArrayStructFields | Yes | | -| GetStructField | Yes | | -| JsonToStructs | No | Partial support. Requires explicit schema. | -| StructsToJson | No | Does not support Infinity/-Infinity for numeric types ([#3016](https://github.com/apache/datafusion-comet/issues/3016)) | +| Expression | +| -------------------- | +| CreateNamedStruct | +| GetArrayStructFields | +| GetStructField | +| JsonToStructs | +| StructsToJson | ## Conversion Expressions -| Expression | Spark-Compatible | Compatibility Notes | -| ---------- | ------------------------ | ------------------------------------------------------------------------------------------- | -| Cast | Depends on specific cast | See the [Comet Compatibility Guide] for list of supported cast expressions and known issues | +| Expression | +| ---------- | +| Cast | ## SortOrder -| Expression | Spark-Compatible? | Compatibility Notes | -| ---------- | ----------------- | ------------------- | -| NullsFirst | Yes | | -| NullsLast | Yes | | -| Ascending | Yes | | -| Descending | Yes | | +| Expression | +| ---------- | +| NullsFirst | +| NullsLast | +| Ascending | +| Descending | ## Other -| Expression | Spark-Compatible? | Compatibility Notes | -| ---------------------------- | ----------------- | --------------------------------------------------------------------------- | -| Alias | Yes | | -| AttributeReference | Yes | | -| BloomFilterMightContain | Yes | | -| Coalesce | Yes | | -| CheckOverflow | Yes | | -| KnownFloatingPointNormalized | Yes | | -| Literal | Yes | | -| MakeDecimal | Yes | | -| MonotonicallyIncreasingID | Yes | | -| NormalizeNaNAndZero | Yes | | -| PromotePrecision | Yes | | -| RegExpReplace | No | Uses Rust regexp engine, which has different behavior to Java regexp engine | -| ScalarSubquery | Yes | | -| SparkPartitionID | Yes | | -| ToPrettyString | Yes | | -| UnscaledValue | Yes | | +| Expression | +| ---------------------------- | +| Alias | +| AttributeReference | +| BloomFilterMightContain | +| Coalesce | +| CheckOverflow | +| KnownFloatingPointNormalized | +| Literal | +| MakeDecimal | +| MonotonicallyIncreasingID | +| NormalizeNaNAndZero | +| PromotePrecision | +| RegExpReplace | +| ScalarSubquery | +| SparkPartitionID | +| ToPrettyString | +| UnscaledValue | [Comet Configuration Guide]: configs.md [Comet Compatibility Guide]: compatibility/expressions/index.md diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala index 32c8c0fdcd..08f0f86917 100644 --- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala +++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala @@ -39,8 +39,10 @@ object GenerateDocs { private val publicConfigs: Set[ConfigEntry[_]] = CometConf.allConfs.filter(_.isPublic).toSet - /** (expression class simple name, incompatible reasons, unsupported reasons) */ - private type CategoryNotes = Seq[(String, Seq[String], Seq[String])] + /** + * (expression class simple name, compatible notes, incompatible reasons, unsupported reasons) + */ + private type CategoryNotes = Seq[(String, Seq[String], Seq[String], Seq[String])] /** * Mapping from expression category to the compatibility guide page where that category's @@ -51,27 +53,74 @@ object GenerateDocs { "array" -> ("compatibility/expressions/array.md", () => 
QueryPlanSerde.arrayExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "datetime" -> ("compatibility/expressions/datetime.md", () => QueryPlanSerde.temporalExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "math" -> ("compatibility/expressions/math.md", () => QueryPlanSerde.mathExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "struct" -> ("compatibility/expressions/struct.md", () => QueryPlanSerde.structExpressions.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) }), "aggregate" -> ("compatibility/expressions/aggregate.md", () => QueryPlanSerde.aggrSerdeMap.toSeq.map { case (cls, serde) => - (cls.getSimpleName, serde.getIncompatibleReasons(), serde.getUnsupportedReasons()) + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) + }), + "string" -> ("compatibility/expressions/string.md", + () => + QueryPlanSerde.stringExpressions.toSeq.map { case (cls, serde) => + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) + }), + "map" -> ("compatibility/expressions/map.md", + () => + QueryPlanSerde.mapExpressions.toSeq.map { case (cls, serde) => + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) + }), + "misc" -> ("compatibility/expressions/misc.md", + () => + QueryPlanSerde.miscExpressions.toSeq.map { case (cls, serde) => + ( + cls.getSimpleName, + serde.getCompatibleNotes(), + serde.getIncompatibleReasons(), + serde.getUnsupportedReasons()) })) def main(args: Array[String]): Unit = { @@ -176,11 +225,19 @@ object GenerateDocs { } private def writeExpressionCompatNotes(w: BufferedOutputStream, notes: CategoryNotes): Unit = { - val sorted = notes.sortBy(_._1).filter { case (_, incompat, unsupported) => - incompat.nonEmpty || unsupported.nonEmpty + val sorted = notes.sortBy(_._1).filter { case (_, compat, incompat, unsupported) => + compat.nonEmpty || incompat.nonEmpty || unsupported.nonEmpty } - for ((name, incompat, unsupported) <- sorted) { + for ((name, compat, incompat, unsupported) <- sorted) { w.write(s"\n### $name\n".getBytes) + if (compat.nonEmpty) { + w.write( + ("\nThe following differences from Spark are always present and do not require" + + " any additional configuration:\n\n").getBytes) + for (note <- compat) { + w.write(s"- $note\n".getBytes) + } + } if (incompat.nonEmpty) { w.write( (s"\nThe following incompatibilities cause `$name` to fall back to Spark by default." 
+ diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 898a4f916e..400229a402 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -48,6 +48,13 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { DataTypes.TimestampType, DataTypes.TimestampNTZType) + override def getIncompatibleReasons(): Seq[String] = Seq( + "Some cast operations between specific type pairs may produce different results than Spark." + + " Refer to the compatibility guide for the full matrix of supported cast operations.") + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Not all cast type combinations are supported. Unsupported casts fall back to Spark.") + override def getSupportLevel(cast: Cast): SupportLevel = { if (cast.child.isInstanceOf[Literal]) { // casting from literal is compatible because we delegate to Spark diff --git a/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala index ba220dcce8..316510400b 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometAggregateExpressionSerde.scala @@ -39,6 +39,18 @@ trait CometAggregateExpressionSerde[T <: AggregateFunction] { */ def getExprConfigName(expr: T): String = expr.getClass.getSimpleName + /** + * Get documentation notes about ways this expression may differ from Spark that do not require + * the user to opt in via `spark.comet.expr.allowIncompatible`. Use this for differences that + * are always present, such as non-determinism or locale-specific behavior. This is called from + * GenerateDocs when generating the Compatibility Guide. Each note should be written in Markdown + * and may span multiple lines. + * + * @return + * List of notes, defaulting to an empty list. + */ + def getCompatibleNotes(): Seq[String] = Seq.empty + /** * Get documentation for usages where this expression may be incompatible with Spark. This is * called from GenerateDocs when generating the Compatibility Guide. Each reason should be diff --git a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala index afad6f6bb2..e015de879a 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometExpressionSerde.scala @@ -37,6 +37,18 @@ trait CometExpressionSerde[T <: Expression] { */ def getExprConfigName(expr: T): String = expr.getClass.getSimpleName + /** + * Get documentation notes about ways this expression may differ from Spark that do not require + * the user to opt in via `spark.comet.expr.allowIncompatible`. Use this for differences that + * are always present, such as non-determinism or locale-specific behavior. This is called from + * GenerateDocs when generating the Compatibility Guide. Each note should be written in Markdown + * and may span multiple lines. + * + * @return + * List of notes, defaulting to an empty list. + */ + def getCompatibleNotes(): Seq[String] = Seq.empty + /** * Get documentation for usages where this expression may be incompatible with Spark. This is * called from GenerateDocs when generating the Compatibility Guide. 
Each reason should be diff --git a/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala b/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala index 3390d86cee..3647645109 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala @@ -27,6 +27,10 @@ import org.apache.comet.serde.QueryPlanSerde.exprToProtoInternal object CometSortOrder extends CometExpressionSerde[SortOrder] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "When `" + CometConf.COMET_EXEC_STRICT_FLOATING_POINT.key + "=true`, sorting on" + + " floating-point types is not 100% compatible with Spark") + override def getSupportLevel(expr: SortOrder): SupportLevel = { if (CometConf.COMET_EXEC_STRICT_FLOATING_POINT.get() && diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 1d672edeea..c3dc6dcfd5 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -128,7 +128,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[Abs] -> CometAbs, classOf[Bin] -> CometScalarFunction("bin")) - private val mapExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + private[comet] val mapExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[GetMapValue] -> CometMapExtract, classOf[MapKeys] -> CometMapKeys, classOf[MapEntries] -> CometMapEntries, @@ -232,7 +232,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { private val conversionExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[Cast] -> CometCast) - private val miscExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + private[comet] val miscExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( // TODO PromotePrecision classOf[Alias] -> CometAlias, classOf[AttributeReference] -> CometAttributeReference, diff --git a/spark/src/main/scala/org/apache/comet/serde/aggregates.scala b/spark/src/main/scala/org/apache/comet/serde/aggregates.scala index 549776e7c3..6889fc02d9 100644 --- a/spark/src/main/scala/org/apache/comet/serde/aggregates.scala +++ b/spark/src/main/scala/org/apache/comet/serde/aggregates.scala @@ -205,6 +205,8 @@ object CometAverage extends CometAggregateExpressionSerde[Average] { object CometSum extends CometAggregateExpressionSerde[Sum] { + override def getIncompatibleReasons(): Seq[String] = Seq("Falls back to Spark in ANSI mode.") + override def convert( aggExpr: AggregateExpression, sum: Sum, @@ -245,6 +247,10 @@ object CometSum extends CometAggregateExpressionSerde[Sum] { } object CometFirst extends CometAggregateExpressionSerde[First] { + + override def getCompatibleNotes(): Seq[String] = Seq( + "This function is not deterministic. Results may not match Spark.") + override def convert( aggExpr: AggregateExpression, first: First, @@ -277,6 +283,10 @@ object CometFirst extends CometAggregateExpressionSerde[First] { } object CometLast extends CometAggregateExpressionSerde[Last] { + + override def getCompatibleNotes(): Seq[String] = Seq( + "This function is not deterministic. 
Results may not match Spark.") + override def convert( aggExpr: AggregateExpression, last: Last, diff --git a/spark/src/main/scala/org/apache/comet/serde/arrays.scala b/spark/src/main/scala/org/apache/comet/serde/arrays.scala index 28bce4cec9..50833c00ca 100644 --- a/spark/src/main/scala/org/apache/comet/serde/arrays.scala +++ b/spark/src/main/scala/org/apache/comet/serde/arrays.scala @@ -57,8 +57,6 @@ object CometArrayRemove object CometArrayAppend extends CometExpressionSerde[ArrayAppend] { - override def getSupportLevel(expr: ArrayAppend): SupportLevel = Compatible() - override def convert( expr: ArrayAppend, inputs: Seq[Attribute], @@ -123,6 +121,10 @@ object CometArrayContains extends CometExpressionSerde[ArrayContains] { object CometSortArray extends CometExpressionSerde[SortArray] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "When `" + CometConf.COMET_EXEC_STRICT_FLOATING_POINT.key + "=true`, sorting on" + + " floating-point types is not 100% compatible with Spark") + override def getUnsupportedReasons(): Seq[String] = Seq( "Nested arrays with `Struct` or `Null` child values are not supported natively and will" + " fall back to Spark.") @@ -191,6 +193,9 @@ object CometSortArray extends CometExpressionSerde[SortArray] { object CometArrayIntersect extends CometExpressionSerde[ArrayIntersect] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Null handling and ordering may differ from Spark") + override def getSupportLevel(expr: ArrayIntersect): SupportLevel = Incompatible(None) override def convert( @@ -314,6 +319,9 @@ object CometArrayCompact extends CometExpressionSerde[Expression] { object CometArrayExcept extends CometExpressionSerde[ArrayExcept] with CometExprShim { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Null handling and ordering may differ from Spark") + override def getSupportLevel(expr: ArrayExcept): SupportLevel = Incompatible(None) @tailrec @@ -353,6 +361,8 @@ object CometArrayExcept extends CometExpressionSerde[ArrayExcept] with CometExpr object CometArrayJoin extends CometExpressionSerde[ArrayJoin] { + override def getIncompatibleReasons(): Seq[String] = Seq("Null handling may differ from Spark") + override def getSupportLevel(expr: ArrayJoin): SupportLevel = Incompatible(None) override def convert( @@ -498,6 +508,8 @@ object CometGetArrayItem extends CometExpressionSerde[GetArrayItem] { object CometArrayReverse extends CometExpressionSerde[Reverse] with ArraysBase { val unsupportedReason = "reverse on array containing binary is not supported" + override def getIncompatibleReasons(): Seq[String] = Seq(unsupportedReason) + @tailrec private def containsBinary(dt: DataType): Boolean = { dt match { @@ -532,6 +544,9 @@ object CometArrayReverse extends CometExpressionSerde[Reverse] with ArraysBase { object CometElementAt extends CometExpressionSerde[ElementAt] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Input must be an array. 
`Map` inputs are not supported.") + override def convert( expr: ElementAt, inputs: Seq[Attribute], @@ -589,6 +604,9 @@ object CometFlatten extends CometExpressionSerde[Flatten] with ArraysBase { object CometArrayFilter extends CometExpressionSerde[ArrayFilter] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `array_filter` when the function is `IsNotNull` (used by `array_compact`)") + override def getSupportLevel(expr: ArrayFilter): SupportLevel = { expr.function.children.headOption match { case Some(_: IsNotNull) => Compatible() @@ -606,6 +624,9 @@ object CometArrayFilter extends CometExpressionSerde[ArrayFilter] { object CometSize extends CometExpressionSerde[Size] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `ArrayType` input; `MapType` input is not supported") + override def getSupportLevel(expr: Size): SupportLevel = { expr.child.dataType match { case _: ArrayType => Compatible() @@ -692,6 +713,9 @@ object CometArrayPosition extends CometExpressionSerde[ArrayPosition] with Array object CometArraysZip extends CometExpressionSerde[ArraysZip] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Not all input data types are supported; falls back to Spark for unsupported types") + private def isTypeSupported(dt: DataType): Boolean = { import DataTypes._ dt match { diff --git a/spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala b/spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala index 716241040e..4eb2ec6a0d 100644 --- a/spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala +++ b/spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala @@ -26,6 +26,9 @@ import org.apache.comet.serde.ExprOuterClass.Expr object CometReverse extends CometScalarFunction[Reverse]("reverse") { + override def getIncompatibleReasons(): Seq[String] = + CometArrayReverse.getIncompatibleReasons() + override def getSupportLevel(expr: Reverse): SupportLevel = { if (expr.child.dataType.isInstanceOf[ArrayType]) { CometArrayReverse.getSupportLevel(expr) diff --git a/spark/src/main/scala/org/apache/comet/serde/contraintExpressions.scala b/spark/src/main/scala/org/apache/comet/serde/contraintExpressions.scala index 62643d6da6..80a2a39ef4 100644 --- a/spark/src/main/scala/org/apache/comet/serde/contraintExpressions.scala +++ b/spark/src/main/scala/org/apache/comet/serde/contraintExpressions.scala @@ -28,6 +28,9 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn object CometKnownFloatingPointNormalized extends CometExpressionSerde[KnownFloatingPointNormalized] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `NormalizeNaNAndZero` child expressions") + override def getSupportLevel(expr: KnownFloatingPointNormalized): SupportLevel = { expr.child match { case _: NormalizeNaNAndZero => Compatible() diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 52b086184b..10580aab20 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -220,6 +220,10 @@ object CometHour extends CometExpressionSerde[Hour] { object CometMinute extends CometExpressionSerde[Minute] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Incorrectly applies timezone conversion to TimestampNTZ inputs" + + " (https://github.com/apache/datafusion-comet/issues/3180)") + override def 
getSupportLevel(expr: Minute): SupportLevel = { if (expr.child.dataType.typeName == "timestamp_ntz") { Incompatible( @@ -258,6 +262,10 @@ object CometMinute extends CometExpressionSerde[Minute] { object CometSecond extends CometExpressionSerde[Second] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Incorrectly applies timezone conversion to TimestampNTZ inputs" + + " (https://github.com/apache/datafusion-comet/issues/3180)") + override def getSupportLevel(expr: Second): SupportLevel = { if (expr.child.dataType.typeName == "timestamp_ntz") { Incompatible( @@ -296,6 +304,11 @@ object CometSecond extends CometExpressionSerde[Second] { object CometUnixTimestamp extends CometExpressionSerde[UnixTimestamp] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only `TimestampType` and `DateType` inputs are supported." + + " `TimestampNTZType` is not supported because Comet incorrectly applies timezone" + + " conversion to TimestampNTZ values.") + private def isSupportedInputType(expr: UnixTimestamp): Boolean = { // Note: TimestampNTZType is not supported because Comet incorrectly applies // timezone conversion to TimestampNTZ values. TimestampNTZ stores local time @@ -392,6 +405,12 @@ object CometTruncDate extends CometExpressionSerde[TruncDate] { val supportedFormats: Seq[String] = Seq("year", "yyyy", "yy", "quarter", "mon", "month", "mm", "week") + override def getIncompatibleReasons(): Seq[String] = Seq( + "Non-literal format strings will throw an exception instead of returning NULL") + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only the following formats are supported: " + supportedFormats.mkString(", ")) + override def getSupportLevel(expr: TruncDate): SupportLevel = { expr.format match { case Literal(fmt: UTF8String, _) => diff --git a/spark/src/main/scala/org/apache/comet/serde/decimalExpressions.scala b/spark/src/main/scala/org/apache/comet/serde/decimalExpressions.scala index 880f01742b..52adf950c1 100644 --- a/spark/src/main/scala/org/apache/comet/serde/decimalExpressions.scala +++ b/spark/src/main/scala/org/apache/comet/serde/decimalExpressions.scala @@ -39,6 +39,8 @@ object CometUnscaledValue extends CometExpressionSerde[UnscaledValue] { object CometMakeDecimal extends CometExpressionSerde[MakeDecimal] { + override def getUnsupportedReasons(): Seq[String] = Seq("Only `LongType` input is supported") + override def getSupportLevel(expr: MakeDecimal): SupportLevel = { expr.child.dataType match { case LongType => Compatible() diff --git a/spark/src/main/scala/org/apache/comet/serde/literals.scala b/spark/src/main/scala/org/apache/comet/serde/literals.scala index e24b55449c..c81b146a89 100644 --- a/spark/src/main/scala/org/apache/comet/serde/literals.scala +++ b/spark/src/main/scala/org/apache/comet/serde/literals.scala @@ -37,6 +37,9 @@ import org.apache.comet.serde.Types.ListLiteral object CometLiteral extends CometExpressionSerde[Literal] with Logging { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Not all data types are supported for literal values") + override def getSupportLevel(expr: Literal): SupportLevel = { if (supportedDataType( diff --git a/spark/src/main/scala/org/apache/comet/serde/maps.scala b/spark/src/main/scala/org/apache/comet/serde/maps.scala index ceafc157c4..1d2fcf9366 100644 --- a/spark/src/main/scala/org/apache/comet/serde/maps.scala +++ b/spark/src/main/scala/org/apache/comet/serde/maps.scala @@ -137,6 +137,9 @@ object CometMapFromEntries extends CometScalarFunction[MapFromEntries]("map_from val 
keyUnsupportedReason = "Using BinaryType as Map keys is not allowed in map_from_entries" val valueUnsupportedReason = "Using BinaryType as Map values is not allowed in map_from_entries" + override def getIncompatibleReasons(): Seq[String] = + Seq(keyUnsupportedReason, valueUnsupportedReason) + private def containsBinary(dataType: DataType): Boolean = { dataType match { case BinaryType => true diff --git a/spark/src/main/scala/org/apache/comet/serde/math.scala b/spark/src/main/scala/org/apache/comet/serde/math.scala index 2f3a6902d6..401d14cc78 100644 --- a/spark/src/main/scala/org/apache/comet/serde/math.scala +++ b/spark/src/main/scala/org/apache/comet/serde/math.scala @@ -205,6 +205,8 @@ sealed trait MathExprBase { object CometCheckOverflow extends CometExpressionSerde[CheckOverflow] { + override def getUnsupportedReasons(): Seq[String] = Seq("Only `DecimalType` is supported") + override def getSupportLevel(expr: CheckOverflow): SupportLevel = { if (expr.dataType.isInstanceOf[DecimalType]) { Compatible() diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index 50621fc389..968fe8cd69 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -33,6 +33,10 @@ import org.apache.comet.serde.QueryPlanSerde.{createBinaryExpr, exprToProtoInter object CometStringRepeat extends CometExpressionSerde[StringRepeat] { + override def getCompatibleNotes(): Seq[String] = Seq( + "A negative argument for the number of times to repeat throws an exception" + + " instead of returning an empty string as Spark does") + override def convert( expr: StringRepeat, inputs: Seq[Attribute], @@ -50,6 +54,10 @@ object CometStringRepeat extends CometExpressionSerde[StringRepeat] { class CometCaseConversionBase[T <: Expression](function: String) extends CometScalarFunction[T](function) { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Results can vary depending on locale and character set." + + s" Requires `${CometConf.COMET_CASE_CONVERSION_ENABLED.key}=true` to enable.") + override def convert(expr: T, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { if (!CometConf.COMET_CASE_CONVERSION_ENABLED.get()) { withInfo( @@ -68,6 +76,8 @@ object CometUpper extends CometCaseConversionBase[Upper]("upper") object CometLower extends CometCaseConversionBase[Lower]("lower") object CometLength extends CometScalarFunction[Length]("length") { + override def getUnsupportedReasons(): Seq[String] = Seq("`BinaryType` input is not supported") + override def getSupportLevel(expr: Length): SupportLevel = expr.child.dataType match { case _: BinaryType => Unsupported(Some("Length on BinaryType is not supported")) case _ => Compatible() @@ -76,6 +86,11 @@ object CometLength extends CometScalarFunction[Length]("length") { object CometInitCap extends CometScalarFunction[InitCap]("initcap") { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Treats hyphen as a word separator (e.g. `robert rose-smith` produces `Robert Rose-Smith`" + + " instead of Spark's `Robert Rose-smith`)" + + " (https://github.com/apache/datafusion-comet/issues/1052)") + override def getSupportLevel(expr: InitCap): SupportLevel = { // Behavior differs from Spark. One example is that for the input "robert rose-smith", Spark // will produce "Robert Rose-smith", but Comet will produce "Robert Rose-Smith". 
@@ -116,6 +131,10 @@ object CometSubstring extends CometExpressionSerde[Substring] { object CometLeft extends CometExpressionSerde[Left] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `BinaryType` and `StringType` input", + "The length argument must be a literal value") + override def convert(expr: Left, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { expr.len match { case Literal(lenValue, _) => @@ -179,6 +198,8 @@ object CometRight extends CometExpressionSerde[Right] { } } + override def getUnsupportedReasons(): Seq[String] = Seq("Only supports `StringType` input") + override def getSupportLevel(expr: Right): SupportLevel = { expr.str.dataType match { case _: StringType => Compatible() @@ -190,6 +211,8 @@ object CometRight extends CometExpressionSerde[Right] { object CometConcat extends CometScalarFunction[Concat]("concat") { val unsupportedReason = "CONCAT supports only string input parameters" + override def getIncompatibleReasons(): Seq[String] = Seq(unsupportedReason) + override def getSupportLevel(expr: Concat): SupportLevel = { if (expr.children.forall(_.dataType == DataTypes.StringType)) { Compatible() @@ -240,6 +263,9 @@ object CometLike extends CometExpressionSerde[Like] { object CometRLike extends CometExpressionSerde[RLike] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Uses Rust regexp engine, which has different behavior to Java regexp engine") + override def convert(expr: RLike, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { expr.right match { case Literal(pattern, DataTypes.StringType) => @@ -269,6 +295,10 @@ object CometRLike extends CometExpressionSerde[RLike] { object CometStringRPad extends CometExpressionSerde[StringRPad] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Scalar values are not supported for the `str` argument." + + " Only scalar values are supported for the `pad` argument.") + override def getSupportLevel(expr: StringRPad): SupportLevel = { if (expr.str.isInstanceOf[Literal]) { return Unsupported(Some("Scalar values are not supported for the str argument")) @@ -294,6 +324,10 @@ object CometStringRPad extends CometExpressionSerde[StringRPad] { object CometStringLPad extends CometExpressionSerde[StringLPad] { + override def getUnsupportedReasons(): Seq[String] = Seq( + "Scalar values are not supported for the `str` argument." 
+ + " Only scalar values are supported for the `pad` argument.") + override def getSupportLevel(expr: StringLPad): SupportLevel = { if (expr.str.isInstanceOf[Literal]) { return Unsupported(Some("Scalar values are not supported for the str argument")) @@ -317,6 +351,12 @@ object CometStringLPad extends CometExpressionSerde[StringLPad] { } object CometRegExpReplace extends CometExpressionSerde[RegExpReplace] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Regexp pattern may not be compatible with Spark") + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only supports `regexp_replace` with an offset of 1 (no offset)") + override def getSupportLevel(expr: RegExpReplace): SupportLevel = { if (!RegExp.isSupportedPattern(expr.regexp.toString) && !CometConf.isExprAllowIncompat("regexp")) { @@ -361,6 +401,9 @@ object CometRegExpReplace extends CometExpressionSerde[RegExpReplace] { */ object CometStringSplit extends CometExpressionSerde[StringSplit] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Regex engine differences between Java and Rust") + override def getSupportLevel(expr: StringSplit): SupportLevel = Incompatible(Some("Regex engine differences between Java and Rust")) @@ -384,6 +427,10 @@ object CometStringSplit extends CometExpressionSerde[StringSplit] { object CometGetJsonObject extends CometExpressionSerde[GetJsonObject] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Spark allows single-quoted JSON and unescaped control characters which Comet does not" + + " support") + override def getSupportLevel(expr: GetJsonObject): SupportLevel = Incompatible( Some( diff --git a/spark/src/main/scala/org/apache/comet/serde/structs.scala b/spark/src/main/scala/org/apache/comet/serde/structs.scala index a9fffcf6f7..688d27f91f 100644 --- a/spark/src/main/scala/org/apache/comet/serde/structs.scala +++ b/spark/src/main/scala/org/apache/comet/serde/structs.scala @@ -181,6 +181,11 @@ object CometStructsToJson extends CometExpressionSerde[StructsToJson] { object CometJsonToStructs extends CometExpressionSerde[JsonToStructs] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Partially implemented and not comprehensively tested") + + override def getUnsupportedReasons(): Seq[String] = Seq("Requires an explicit schema") + override def getSupportLevel(expr: JsonToStructs): SupportLevel = { // this feature is partially implemented and not comprehensively tested yet Incompatible() @@ -247,6 +252,13 @@ object CometStructsToCsv extends CometExpressionSerde[StructsToCsv] { private val incompatibleDataTypes = Seq(DateType, TimestampType, TimestampNTZType, BinaryType) + override def getIncompatibleReasons(): Seq[String] = Seq( + "Date, Timestamp, TimestampNTZ, and Binary data types may produce different results" + + " (https://github.com/apache/datafusion-comet/issues/3232)") + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Complex types (arrays, maps, structs) in the schema are not supported") + override def getSupportLevel(expr: StructsToCsv): SupportLevel = { val dataTypes = expr.inputSchema.fields.map(_.dataType) val containsComplexType = dataTypes.exists(DataTypeSupport.isComplexType) diff --git a/spark/src/main/scala/org/apache/comet/serde/unixtime.scala b/spark/src/main/scala/org/apache/comet/serde/unixtime.scala index 198c7d3101..e5eeb5b848 100644 --- a/spark/src/main/scala/org/apache/comet/serde/unixtime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/unixtime.scala @@ -29,6 +29,11 @@ import 
org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn // https://github.com/apache/datafusion/issues/16594 object CometFromUnixTime extends CometExpressionSerde[FromUnixTime] { + override def getIncompatibleReasons(): Seq[String] = Seq( + "Only supports the default datetime format pattern `yyyy-MM-dd HH:mm:ss`." + + " DataFusion's valid timestamp range differs from Spark" + + " (https://github.com/apache/datafusion/issues/16594)") + override def getSupportLevel(expr: FromUnixTime): SupportLevel = Incompatible(None) override def convert(