diff --git a/datafusion-cli/tests/snapshots/cli_quick_test@backslash.snap b/datafusion-cli/tests/snapshots/cli_quick_test@backslash.snap index c01699146aa8c..8742d4c04f15d 100644 --- a/datafusion-cli/tests/snapshots/cli_quick_test@backslash.snap +++ b/datafusion-cli/tests/snapshots/cli_quick_test@backslash.snap @@ -12,6 +12,6 @@ info: success: true exit_code: 0 ----- stdout ----- -[{"Utf8(\"\\\")":"\\","Utf8(\"\\\\\")":"\\\\","Utf8(\"\\\\\\\\\\\")":"\\\\\\\\\\","Utf8(\"dsdsds\\\\\\\\\")":"dsdsds\\\\\\\\","Utf8(\"\\t\")":"\\t","Utf8(\"\\0\")":"\\0","Utf8(\"\\n\")":"\\n"}] +[{"Utf8(\"\\\")":"\\","Utf8(\"\\\\\")":"\\\\","Utf8(\"\\\\\\\\\")":"\\\\\\\\","Utf8(\"dsdsds\\\\\\\\\")":"dsdsds\\\\\\\\","Utf8(\"\\t\")":"\\t","Utf8(\"\\0\")":"\\0","Utf8(\"\\n\")":"\\n"}] ----- stderr ----- diff --git a/datafusion-cli/tests/sql/backslash.sql b/datafusion-cli/tests/sql/backslash.sql index f2fe2b03746ea..34823a1aa083d 100644 --- a/datafusion-cli/tests/sql/backslash.sql +++ b/datafusion-cli/tests/sql/backslash.sql @@ -1 +1 @@ -select '\', '\\', '\\\\\', 'dsdsds\\\\', '\t', '\0', '\n'; \ No newline at end of file +select '\', '\\', '\\\\', 'dsdsds\\\\', '\t', '\0', '\n' \ No newline at end of file diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 53f2501d60752..424e2fa3d5f6c 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -294,6 +294,11 @@ config_namespace! { /// Default is true. pub map_string_types_to_utf8view: bool, default = true + /// When set to true, SQL string literals use Spark-compatible backslash + /// escape handling during SQL planning. This should only be enabled for + /// Spark compatibility mode. + pub spark_string_literal_unescape: bool, default = false + /// When set to true, the source locations relative to the original SQL /// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected /// and recorded in the logical plan nodes. 
diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index f0888e01049ad..a428dcbf36d3b 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -534,6 +534,8 @@ impl SessionState { .enable_options_value_normalization, support_varchar_with_length: sql_parser_options.support_varchar_with_length, map_string_types_to_utf8view: sql_parser_options.map_string_types_to_utf8view, + spark_string_literal_unescape: sql_parser_options + .spark_string_literal_unescape, collect_spans: sql_parser_options.collect_spans, default_null_ordering: sql_parser_options .default_null_ordering diff --git a/datafusion/spark/src/session_state.rs b/datafusion/spark/src/session_state.rs index e39de3a5888ea..f735c3e9e7464 100644 --- a/datafusion/spark/src/session_state.rs +++ b/datafusion/spark/src/session_state.rs @@ -81,6 +81,12 @@ impl SessionStateBuilderSpark for SessionStateBuilder { .map(|f| (f.name().to_string(), f)), ); + self.config() + .get_or_insert_with(Default::default) + .options_mut() + .sql_parser + .spark_string_literal_unescape = true; + self } } diff --git a/datafusion/sql/src/expr/value.rs b/datafusion/sql/src/expr/value.rs index bd75ac36306fb..4ae41ba8ad459 100644 --- a/datafusion/sql/src/expr/value.rs +++ b/datafusion/sql/src/expr/value.rs @@ -48,8 +48,14 @@ impl SqlToRel<'_, S> { param_data_types: &[FieldRef], ) -> Result { match value { - Value::Number(n, _) => self.parse_sql_number(&n, false), - Value::SingleQuotedString(s) | Value::DoubleQuotedString(s) => Ok(lit(s)), + Value::Number(n,_) => self.parse_sql_number(&n, false), + Value::SingleQuotedString(s) | Value::DoubleQuotedString(s) => { + if self.options.spark_string_literal_unescape { + Ok(lit(unescape_string_literal(&s)?)) + } else { + Ok(lit(s)) + } + } Value::Null => Ok(Expr::Literal(ScalarValue::Null, None)), Value::Boolean(n) => Ok(lit(n)), Value::Placeholder(param) => { @@ -63,7 +69,13 @@ impl 
SqlToRel<'_, S> { } } Value::DollarQuotedString(s) => Ok(lit(s.value)), - Value::EscapedStringLiteral(s) => Ok(lit(s)), + Value::EscapedStringLiteral(s) => { + if self.options.spark_string_literal_unescape { + Ok(lit(unescape_string_literal(&s)?)) + } else { + Ok(lit(s)) + } + } _ => plan_err!("Unsupported Value '{value:?}'"), } } @@ -305,6 +317,62 @@ fn interval_literal(interval_value: SQLExpr, negative: bool) -> Result { if negative { Ok(format!("-{s}")) } else { Ok(s) } } +fn unescape_string_literal(s: &str) -> Result { + let mut out = String::with_capacity(s.len()); + let mut chars = s.chars().peekable(); + + while let Some(ch) = chars.next() { + if ch != '\\' { + out.push(ch); + continue; + } + + let Some(next) = chars.next() else { + out.push('\\'); + break; + }; + + match next { + '0' => out.push('\0'), + 'b' => out.push('\u{0008}'), + 'n' => out.push('\n'), + 'r' => out.push('\r'), + 't' => out.push('\t'), + 'Z' => out.push('\u{001A}'), + '\\' => out.push('\\'), + '\'' => out.push('\''), + '"' => out.push('"'), + '%' => out.push('%'), + '_' => out.push('_'), + + '0'..='7' => { + let mut octal = String::new(); + octal.push(next); + + for _ in 0..2 { + match chars.peek() { + Some('0'..='7') => { + octal.push(chars.next().unwrap()); + } + _ => break, + } + } + + let value = u8::from_str_radix(&octal, 8).map_err(|_| { + DataFusionError::from(ParserError(format!( + "Invalid octal escape sequence: \\{octal}" + ))) + })?; + out.push(value as char); + } + + other => out.push(other), + } + } + + Ok(out) +} + /// Try to decode bytes from hex literal string. /// /// None will be returned if the input literal is hex-invalid. 
/// Single-character escapes decode to the character they name.
#[test]
fn test_unescape_string_literal_basic_escapes() {
    // (escaped input, expected decoded text)
    let cases = [
        (r"\t hello", "\t hello"),
        (r"\n hello", "\n hello"),
        (r"\r hello", "\r hello"),
        (r"\\", "\\"),
        (r"it\'s", "it's"),
        (r#"a\"b"#, "a\"b"),
    ];

    for (input, expected) in cases {
        assert_eq!(unescape_string_literal(input).unwrap(), expected);
    }
}

/// Octal escapes decode to the character with that code point.
#[test]
fn test_unescape_string_literal_octal() {
    let cases = [(r"\101", "A"), (r"\141", "a"), (r"\7", "\x07")];

    for (input, expected) in cases {
        assert_eq!(unescape_string_literal(input).unwrap(), expected);
    }
}

/// An unrecognised escape drops the backslash and keeps the character.
#[test]
fn test_unescape_string_literal_unknown_escape() {
    let cases = [(r"\x", "x"), (r"abc\qdef", "abcqdef")];

    for (input, expected) in cases {
        assert_eq!(unescape_string_literal(input).unwrap(), expected);
    }
}

/// A backslash at the very end of the input is kept verbatim.
#[test]
fn test_unescape_string_literal_trailing_backslash() {
    let decoded = unescape_string_literal("abc\\").unwrap();
    assert_eq!(decoded, "abc\\");
}
(VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. pub map_string_types_to_utf8view: bool, + /// Whether to use Spark-compatible string literal unescaping. + pub spark_string_literal_unescape: bool, /// Default null ordering for sorting expressions. pub default_null_ordering: NullOrdering, } @@ -78,6 +80,7 @@ impl ParserOptions { map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, + spark_string_literal_unescape: false, // By default, `nulls_max` is used to follow Postgres's behavior. // postgres rule: https://www.postgresql.org/docs/current/queries-order.html default_null_ordering: NullOrdering::NullsMax, @@ -124,6 +127,12 @@ impl ParserOptions { self } + /// Sets the spark_string_literal_unescape option. + pub fn with_spark_string_literal_unescape(mut self, value: bool) -> Self { + self.spark_string_literal_unescape = value; + self + } + /// Sets the `enable_options_value_normalization` option. pub fn with_enable_options_value_normalization(mut self, value: bool) -> Self { self.enable_options_value_normalization = value; @@ -159,6 +168,7 @@ impl From<&SqlParserOptions> for ParserOptions { enable_options_value_normalization: options .enable_options_value_normalization, collect_spans: options.collect_spans, + spark_string_literal_unescape: options.spark_string_literal_unescape, default_null_ordering: options.default_null_ordering.as_str().into(), } } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 2c2c7eac8bfc4..5f1b2602a5bf8 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -3571,6 +3571,7 @@ fn parse_decimals_parser_options() -> ParserOptions { map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, + spark_string_literal_unescape: false, default_null_ordering: NullOrdering::NullsMax, } } @@ -3583,6 +3584,7 @@ fn 
ident_normalization_parser_options_no_ident_normalization() -> ParserOptions map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, + spark_string_literal_unescape: false, default_null_ordering: NullOrdering::NullsMax, } } @@ -3595,6 +3597,7 @@ fn ident_normalization_parser_options_ident_normalization() -> ParserOptions { map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, + spark_string_literal_unescape: false, default_null_ordering: NullOrdering::NullsMax, } } diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 0b34f381cbc59..51a8f892f1dae 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -344,6 +344,7 @@ datafusion.sql_parser.enable_options_value_normalization false datafusion.sql_parser.map_string_types_to_utf8view true datafusion.sql_parser.parse_float_as_decimal false datafusion.sql_parser.recursion_limit 50 +datafusion.sql_parser.spark_string_literal_unescape false datafusion.sql_parser.support_varchar_with_length true # show all variables with verbose @@ -488,6 +489,7 @@ datafusion.sql_parser.enable_options_value_normalization false When set to true, datafusion.sql_parser.map_string_types_to_utf8view true If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries +datafusion.sql_parser.spark_string_literal_unescape false When set to true, SQL string literals use Spark-compatible backslash escape handling during SQL planning. 
This should only be enabled for Spark compatibility mode. datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. # show_variable_in_config_options diff --git a/datafusion/sqllogictest/test_files/spark/array/array.slt b/datafusion/sqllogictest/test_files/spark/array/array.slt index 79dca1c10a7d0..a4a5ba1cc5fe2 100644 --- a/datafusion/sqllogictest/test_files/spark/array/array.slt +++ b/datafusion/sqllogictest/test_files/spark/array/array.slt @@ -50,10 +50,10 @@ SELECT array(1, NULL, 3); [1, NULL, 3] -query ? -SELECT array['hello', '', null, 'nULl', 'nULlx', 'aa"bb', 'mm\nn', 'uu,vv', 'yy zz']; +query I +SELECT ascii(substr(array['hello', '', null, 'nULl', 'nULlx', 'aa"bb', 'mm\nn', 'uu,vv', 'yy zz'][7], 3 ,1)); ---- -[hello, , NULL, nULl, nULlx, aa"bb, mm\nn, uu,vv, yy zz] +10 query ? SELECT array(array(1,2),array(3,4)); diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index ec85c4bd40b24..0d8f48e811457 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -80,10 +80,10 @@ SELECT soundex(' hello'); ---- hello -query T -SELECT soundex('\thello'); +query I +SELECT ascii(substr(soundex('\thello'),1 ,1)); ---- -\thello +9 query T SELECT soundex('đŸ˜€hello'); @@ -190,10 +190,10 @@ SELECT soundex('#'); ---- # -query T -SELECT soundex('\nhello'); +query I +SELECT ascii(substr(soundex('\nhello'),1 ,1)); ---- -\nhello +10 query T SELECT concat(soundex(' '), 'Spark')