From 2e5d69832b22d79f5b3f849a06940c3abf7db1e2 Mon Sep 17 00:00:00 2001 From: Vitalii Li Date: Tue, 23 Aug 2022 15:15:21 -0700 Subject: [PATCH 1/7] [SPARK-40194][SQL] SPLIT function on empty regex should truncate trailing empty string. --- .../apache/spark/unsafe/types/UTF8String.java | 11 +++ .../spark/unsafe/types/UTF8StringSuite.java | 9 +++ .../expressions/regexpExpressions.scala | 2 +- .../expressions/RegexpExpressionsSuite.scala | 9 +++ .../sql-tests/inputs/regexp-functions.sql | 11 +++ .../results/regexp-functions.sql.out | 72 +++++++++++++++++++ 6 files changed, 113 insertions(+), 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index c81dc10f8f5eb..8c57f916c2fa9 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -1000,6 +1000,17 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) { } public UTF8String[] split(UTF8String pattern, int limit) { + // This is a special case for converting string into array of symbols without a trailing empty + // string. E.g. `"hello".split("", 0) => ["h", "e", "l", "l", "o"]. + // Note that negative limit will preserve a trailing empty string. 
+ if (limit == 0 && numBytes() != 0 && pattern.numBytes() == 0) { + byte[] input = getBytes(); + UTF8String[] result = new UTF8String[numBytes]; + for (int i = 0; i < numBytes; i++) { + result[i] = UTF8String.fromBytes(input, i, 1); + } + return result; + } return split(pattern.toString(), limit); } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index f530c81df269e..ef863005e0dc0 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -409,6 +409,15 @@ public void split() { assertArrayEquals( new UTF8String[]{fromString("ab"), fromString("def,ghi,")}, fromString("ab,def,ghi,").split(fromString(","), 2)); + assertArrayEquals( + new UTF8String[]{fromString("a"), fromString("b")}, + fromString("ab").split(fromString(""), 0)); + assertArrayEquals( + new UTF8String[]{fromString("a"), fromString("b"), fromString("")}, + fromString("ab").split(fromString(""), -1)); + assertArrayEquals( + new UTF8String[]{fromString("")}, + fromString("").split(fromString(""), 0)); } @Test diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 4e474cbf8cd37..e80bd6839cc8d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -527,7 +527,7 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression) override def second: Expression = regex override def third: Expression = limit - def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1)); + def this(exp: Expression, regex: Expression) = this(exp, regex, 
Literal(0)) override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = { val strings = string.asInstanceOf[UTF8String].split( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 91b0f0e1039f6..5e8ae558b0cbb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -461,6 +461,15 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { StringSplit(s1, s2, -1), Seq("aa", "bb", "cc"), row1) checkEvaluation(StringSplit(s1, s2, -1), null, row2) checkEvaluation(StringSplit(s1, s2, -1), null, row3) + // Empty regex + checkEvaluation( + StringSplit(Literal("hello"), Literal(""), 0), Seq("h", "e", "l", "l", "o"), row1) + checkEvaluation( + StringSplit(Literal("hello"), Literal(""), -1), Seq("h", "e", "l", "l", "o", ""), row1) + checkEvaluation( + StringSplit(Literal(""), Literal(""), -1), Seq(""), row1) + checkEvaluation( + StringSplit(Literal(""), Literal(""), 0), Seq(""), row1) // Test escaping of arguments GenerateUnsafeProjection.generate( diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql index b6d5724343456..6b3eeb93d0b36 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -80,3 +80,14 @@ SELECT regexp_instr('1a 2b 14m', '\\d{2}(a|b|m)'); SELECT regexp_instr('abc', null); SELECT regexp_instr(null, 'b'); SELECT regexp_instr('abc', col0, 1) FROM VALUES(') ?') AS t(col0); + +--split +SELECT split('aa2bb3cc', '[1-9]+'); +SELECT split('aa2bb3cc', '[1-9]+', 2); +SELECT split('aacbbcddc', 'c', 0); +SELECT 
split('aacbbcddc', 'c', -1); +SELECT split('hello', ''); +SELECT split('hello', '', -1); +SELECT split('', ''); +SELECT split('abc', null); +SELECT split(null, 'b'); diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out index 65e1e31ae7cf3..5fb796d12e061 100644 --- a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -590,3 +590,75 @@ org.apache.spark.SparkRuntimeException "expected" : ") ?" } } + + +-- !query +SELECT split('aa2bb3cc', '[1-9]+') +-- !query schema +struct> +-- !query output +["aa","bb","cc"] + + +-- !query +SELECT split('aa2bb3cc', '[1-9]+', 2) +-- !query schema +struct> +-- !query output +["aa","bb3cc"] + + +-- !query +SELECT split('aacbbcddc', 'c', 0) +-- !query schema +struct> +-- !query output +["aa","bb","dd",""] + + +-- !query +SELECT split('aacbbcddc', 'c', -1) +-- !query schema +struct> +-- !query output +["aa","bb","dd",""] + + +-- !query +SELECT split('hello', '') +-- !query schema +struct> +-- !query output +["h","e","l","l","o"] + + +-- !query +SELECT split('hello', '', -1) +-- !query schema +struct> +-- !query output +["h","e","l","l","o",""] + + +-- !query +SELECT split('', '') +-- !query schema +struct> +-- !query output +[""] + + +-- !query +SELECT split('abc', null) +-- !query schema +struct> +-- !query output +NULL + + +-- !query +SELECT split(null, 'b') +-- !query schema +struct> +-- !query output +NULL From 885106c9efa41f1759b8620d4bda2083012b52e6 Mon Sep 17 00:00:00 2001 From: Vitalii Li Date: Tue, 23 Aug 2022 15:24:43 -0700 Subject: [PATCH 2/7] Updated sql-migration-guide.md --- docs/sql-migration-guide.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 42df05f7f7068..5f79fe18c3c45 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -27,6 
+27,7 @@ license: | - Since Spark 3.4, Number or Number(\*) from Teradata will be treated as Decimal(38,18). In Spark 3.3 or earlier, Number or Number(\*) from Teradata will be treated as Decimal(38, 0), in which case the fractional part will be removed. - Since Spark 3.4, v1 database, table, permanent view and function identifier will include 'spark_catalog' as the catalog name if database is defined, e.g. a table identifier will be: `spark_catalog.default.t`. To restore the legacy behavior, set `spark.sql.legacy.v1IdentifierNoCatalog` to `true`. - Since Spark 3.4, when ANSI SQL mode(configuration `spark.sql.ansi.enabled`) is on, Spark SQL always returns NULL result on getting a map value with a non-existing key. In Spark 3.3 or earlier, there will be an error. + - Since Spark 3.4, `split` function ignores trailing empty strings when `regex` parameter is empty and `limit` parameter is not specified or set to 0. ## Upgrading from Spark SQL 3.2 to 3.3 From 7a67a418509069ebe66629f011bc405ff9b09246 Mon Sep 17 00:00:00 2001 From: Vitalii Li Date: Tue, 23 Aug 2022 18:12:35 -0700 Subject: [PATCH 3/7] Some test fixes --- .../sql-functions/sql-expression-schema.md | 2 +- .../sql-tests/inputs/regexp-functions.sql | 11 --- .../sql-tests/inputs/string-functions.sql | 5 ++ .../ansi/higher-order-functions.sql.out | 4 +- .../results/ansi/string-functions.sql.out | 42 ++++++++++- .../sql-tests/results/charvarchar.sql.out | 2 +- .../results/higher-order-functions.sql.out | 4 +- .../results/regexp-functions.sql.out | 72 ------------------- .../results/string-functions.sql.out | 42 ++++++++++- 9 files changed, 93 insertions(+), 91 deletions(-) diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index bc7941f659f24..c36791a046ab9 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ 
-287,7 +287,7 @@ | org.apache.spark.sql.catalyst.expressions.StringRepeat | repeat | SELECT repeat('123', 2) | struct | | org.apache.spark.sql.catalyst.expressions.StringReplace | replace | SELECT replace('ABCabc', 'abc', 'DEF') | struct | | org.apache.spark.sql.catalyst.expressions.StringSpace | space | SELECT concat(space(2), '1') | struct | -| org.apache.spark.sql.catalyst.expressions.StringSplit | split | SELECT split('oneAtwoBthreeC', '[ABC]') | struct> | +| org.apache.spark.sql.catalyst.expressions.StringSplit | split | SELECT split('oneAtwoBthreeC', '[ABC]') | struct> | | org.apache.spark.sql.catalyst.expressions.StringToMap | str_to_map | SELECT str_to_map('a:1,b:2,c:3', ',', ':') | struct> | | org.apache.spark.sql.catalyst.expressions.StringTranslate | translate | SELECT translate('AaBbCc', 'abc', '123') | struct | | org.apache.spark.sql.catalyst.expressions.StringTrim | trim | SELECT trim(' SparkSQL ') | struct | diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql index 6b3eeb93d0b36..b6d5724343456 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -80,14 +80,3 @@ SELECT regexp_instr('1a 2b 14m', '\\d{2}(a|b|m)'); SELECT regexp_instr('abc', null); SELECT regexp_instr(null, 'b'); SELECT regexp_instr('abc', col0, 1) FROM VALUES(') ?') AS t(col0); - ---split -SELECT split('aa2bb3cc', '[1-9]+'); -SELECT split('aa2bb3cc', '[1-9]+', 2); -SELECT split('aacbbcddc', 'c', 0); -SELECT split('aacbbcddc', 'c', -1); -SELECT split('hello', ''); -SELECT split('hello', '', -1); -SELECT split('', ''); -SELECT split('abc', null); -SELECT split(null, 'b'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 058ea89179786..99429b800712c 100644 --- 
a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -26,6 +26,11 @@ select right("abcd", -2), right("abcd", 0), right("abcd", 'a'); -- split function SELECT split('aa1cc2ee3', '[1-9]+'); SELECT split('aa1cc2ee3', '[1-9]+', 2); +SELECT split('hello', ''); +SELECT split('hello', '', -1); +SELECT split('', ''); +SELECT split('abc', null); +SELECT split(null, 'b'); -- split_part function SELECT split_part('11.12.13', '.', 2); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out index c6bbb4fb7179a..f7a6fa585a253 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out @@ -272,6 +272,6 @@ struct> -- !query select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array(x))) -- !query schema -struct>> +struct>> -- !query output -[[""]] +[["h"]] diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index add89a635a83f..d114eca13a765 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -142,7 +142,7 @@ org.apache.spark.SparkNumberFormatException -- !query SELECT split('aa1cc2ee3', '[1-9]+') -- !query schema -struct> +struct> -- !query output ["aa","cc","ee",""] @@ -155,6 +155,46 @@ struct> ["aa","cc2ee3"] +-- !query +SELECT split('hello', '') +-- !query schema +struct> +-- !query output +["h","e","l","l","o"] + + +-- !query +SELECT split('hello', '', -1) +-- !query schema +struct> +-- !query output +["h","e","l","l","o",""] + + +-- !query +SELECT split('', '') +-- !query schema +struct> +-- 
!query output +[""] + + +-- !query +SELECT split('abc', null) +-- !query schema +struct> +-- !query output +NULL + + +-- !query +SELECT split(null, 'b') +-- !query schema +struct> +-- !query output +NULL + + -- !query SELECT split_part('11.12.13', '.', 2) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out index 1f79817787a43..ee5060f5adf7e 100644 --- a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out @@ -1054,7 +1054,7 @@ NetEase NetEase Spark- NetEase -- !query select split(c7, 'e'), split(c8, 'e'), split(v, 'a'), split(s, 'e') from char_tbl4 -- !query schema -struct,split(c8, e, -1):array,split(v, a, -1):array,split(s, e, -1):array> +struct,split(c8, e, 0):array,split(v, a, 0):array,split(s, e, 0):array> -- !query output NULL NULL NULL NULL NULL NULL ["S"] NULL diff --git a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out index c6bbb4fb7179a..f7a6fa585a253 100644 --- a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out @@ -272,6 +272,6 @@ struct> -- !query select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array(x))) -- !query schema -struct>> +struct>> -- !query output -[[""]] +[["h"]] diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out index 5fb796d12e061..65e1e31ae7cf3 100644 --- a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -590,75 +590,3 @@ org.apache.spark.SparkRuntimeException "expected" : ") ?" 
} } - - --- !query -SELECT split('aa2bb3cc', '[1-9]+') --- !query schema -struct> --- !query output -["aa","bb","cc"] - - --- !query -SELECT split('aa2bb3cc', '[1-9]+', 2) --- !query schema -struct> --- !query output -["aa","bb3cc"] - - --- !query -SELECT split('aacbbcddc', 'c', 0) --- !query schema -struct> --- !query output -["aa","bb","dd",""] - - --- !query -SELECT split('aacbbcddc', 'c', -1) --- !query schema -struct> --- !query output -["aa","bb","dd",""] - - --- !query -SELECT split('hello', '') --- !query schema -struct> --- !query output -["h","e","l","l","o"] - - --- !query -SELECT split('hello', '', -1) --- !query schema -struct> --- !query output -["h","e","l","l","o",""] - - --- !query -SELECT split('', '') --- !query schema -struct> --- !query output -[""] - - --- !query -SELECT split('abc', null) --- !query schema -struct> --- !query output -NULL - - --- !query -SELECT split(null, 'b') --- !query schema -struct> --- !query output -NULL diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index dedbd29d4bba1..0ec4f3645fb95 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -108,7 +108,7 @@ struct -- !query SELECT split('aa1cc2ee3', '[1-9]+') -- !query schema -struct> +struct> -- !query output ["aa","cc","ee",""] @@ -121,6 +121,46 @@ struct> ["aa","cc2ee3"] +-- !query +SELECT split('hello', '') +-- !query schema +struct> +-- !query output +["h","e","l","l","o"] + + +-- !query +SELECT split('hello', '', -1) +-- !query schema +struct> +-- !query output +["h","e","l","l","o",""] + + +-- !query +SELECT split('', '') +-- !query schema +struct> +-- !query output +[""] + + +-- !query +SELECT split('abc', null) +-- !query schema +struct> +-- !query output +NULL + + +-- !query +SELECT split(null, 'b') +-- !query schema +struct> +-- !query output 
+NULL + + -- !query SELECT split_part('11.12.13', '.', 2) -- !query schema From 34c6b47791e981b52aa4647c80da529980d0d3f3 Mon Sep 17 00:00:00 2001 From: Vitalii Li Date: Wed, 24 Aug 2022 09:01:59 -0700 Subject: [PATCH 4/7] Revert default limit and fix UTF8 char point size --- .../apache/spark/unsafe/types/UTF8String.java | 12 ++++++++---- .../spark/unsafe/types/UTF8StringSuite.java | 2 +- docs/sql-migration-guide.md | 2 +- .../expressions/regexpExpressions.scala | 2 +- .../expressions/RegexpExpressionsSuite.scala | 2 +- .../sql-functions/sql-expression-schema.md | 2 +- .../sql-tests/inputs/string-functions.sql | 1 - .../ansi/higher-order-functions.sql.out | 2 +- .../results/ansi/string-functions.sql.out | 18 +++++------------- .../sql-tests/results/charvarchar.sql.out | 2 +- .../results/higher-order-functions.sql.out | 2 +- .../sql-tests/results/string-functions.sql.out | 18 +++++------------- 12 files changed, 26 insertions(+), 39 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 8c57f916c2fa9..7a16b53974b07 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -1003,11 +1003,15 @@ public UTF8String[] split(UTF8String pattern, int limit) { // This is a special case for converting string into array of symbols without a trailing empty // string. E.g. `"hello".split("", 0) => ["h", "e", "l", "l", "o"]. // Note that negative limit will preserve a trailing empty string. 
- if (limit == 0 && numBytes() != 0 && pattern.numBytes() == 0) { + if (limit <= 0 && numBytes() != 0 && pattern.numBytes() == 0) { byte[] input = getBytes(); - UTF8String[] result = new UTF8String[numBytes]; - for (int i = 0; i < numBytes; i++) { - result[i] = UTF8String.fromBytes(input, i, 1); + int byteIndex = 0; + int charIndex = 0; + UTF8String[] result = new UTF8String[numChars()]; + while (byteIndex < numBytes) { + int currCharNumBytes = numBytesForFirstByte(input[byteIndex]); + result[charIndex++] = UTF8String.fromBytes(input, byteIndex, currCharNumBytes); + byteIndex += currCharNumBytes; } return result; } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index ef863005e0dc0..f42bd490b1829 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -413,7 +413,7 @@ public void split() { new UTF8String[]{fromString("a"), fromString("b")}, fromString("ab").split(fromString(""), 0)); assertArrayEquals( - new UTF8String[]{fromString("a"), fromString("b"), fromString("")}, + new UTF8String[]{fromString("a"), fromString("b")}, fromString("ab").split(fromString(""), -1)); assertArrayEquals( new UTF8String[]{fromString("")}, diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 5f79fe18c3c45..53aa690646f12 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -27,7 +27,7 @@ license: | - Since Spark 3.4, Number or Number(\*) from Teradata will be treated as Decimal(38,18). In Spark 3.3 or earlier, Number or Number(\*) from Teradata will be treated as Decimal(38, 0), in which case the fractional part will be removed. - Since Spark 3.4, v1 database, table, permanent view and function identifier will include 'spark_catalog' as the catalog name if database is defined, e.g. 
a table identifier will be: `spark_catalog.default.t`. To restore the legacy behavior, set `spark.sql.legacy.v1IdentifierNoCatalog` to `true`. - Since Spark 3.4, when ANSI SQL mode(configuration `spark.sql.ansi.enabled`) is on, Spark SQL always returns NULL result on getting a map value with a non-existing key. In Spark 3.3 or earlier, there will be an error. - - Since Spark 3.4, `split` function ignores trailing empty strings when `regex` parameter is empty and `limit` parameter is not specified or set to 0. + - Since Spark 3.4, `split` function ignores trailing empty strings when `regex` parameter is empty and `limit` parameter is not specified or set to 0 or -1. ## Upgrading from Spark SQL 3.2 to 3.3 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index e80bd6839cc8d..500c040dfe4e9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -527,7 +527,7 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression) override def second: Expression = regex override def third: Expression = limit - def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(0)) + def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1)) override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = { val strings = string.asInstanceOf[UTF8String].split( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 5e8ae558b0cbb..7c756daba42c0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -465,7 +465,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( StringSplit(Literal("hello"), Literal(""), 0), Seq("h", "e", "l", "l", "o"), row1) checkEvaluation( - StringSplit(Literal("hello"), Literal(""), -1), Seq("h", "e", "l", "l", "o", ""), row1) + StringSplit(Literal("hello"), Literal(""), -1), Seq("h", "e", "l", "l", "o"), row1) checkEvaluation( StringSplit(Literal(""), Literal(""), -1), Seq(""), row1) checkEvaluation( diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index c36791a046ab9..bc7941f659f24 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -287,7 +287,7 @@ | org.apache.spark.sql.catalyst.expressions.StringRepeat | repeat | SELECT repeat('123', 2) | struct | | org.apache.spark.sql.catalyst.expressions.StringReplace | replace | SELECT replace('ABCabc', 'abc', 'DEF') | struct | | org.apache.spark.sql.catalyst.expressions.StringSpace | space | SELECT concat(space(2), '1') | struct | -| org.apache.spark.sql.catalyst.expressions.StringSplit | split | SELECT split('oneAtwoBthreeC', '[ABC]') | struct> | +| org.apache.spark.sql.catalyst.expressions.StringSplit | split | SELECT split('oneAtwoBthreeC', '[ABC]') | struct> | | org.apache.spark.sql.catalyst.expressions.StringToMap | str_to_map | SELECT str_to_map('a:1,b:2,c:3', ',', ':') | struct> | | org.apache.spark.sql.catalyst.expressions.StringTranslate | translate | SELECT translate('AaBbCc', 'abc', '123') | struct | | org.apache.spark.sql.catalyst.expressions.StringTrim | trim | SELECT trim(' SparkSQL ') | struct | diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql 
b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 99429b800712c..efbef2ab449ba 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -27,7 +27,6 @@ select right("abcd", -2), right("abcd", 0), right("abcd", 'a'); SELECT split('aa1cc2ee3', '[1-9]+'); SELECT split('aa1cc2ee3', '[1-9]+', 2); SELECT split('hello', ''); -SELECT split('hello', '', -1); SELECT split('', ''); SELECT split('abc', null); SELECT split(null, 'b'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out index f7a6fa585a253..c06e825a36def 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out @@ -272,6 +272,6 @@ struct> -- !query select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array(x))) -- !query schema -struct>> +struct>> -- !query output [["h"]] diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index d114eca13a765..548c52c96620b 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -142,7 +142,7 @@ org.apache.spark.SparkNumberFormatException -- !query SELECT split('aa1cc2ee3', '[1-9]+') -- !query schema -struct> +struct> -- !query output ["aa","cc","ee",""] @@ -158,23 +158,15 @@ struct> -- !query SELECT split('hello', '') -- !query schema -struct> --- !query output -["h","e","l","l","o"] - - --- !query -SELECT split('hello', '', -1) --- !query schema struct> -- !query output -["h","e","l","l","o",""] +["h","e","l","l","o"] -- !query SELECT split('', '') -- !query 
schema -struct> +struct> -- !query output [""] @@ -182,7 +174,7 @@ struct> -- !query SELECT split('abc', null) -- !query schema -struct> +struct> -- !query output NULL @@ -190,7 +182,7 @@ NULL -- !query SELECT split(null, 'b') -- !query schema -struct> +struct> -- !query output NULL diff --git a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out index ee5060f5adf7e..1f79817787a43 100644 --- a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out @@ -1054,7 +1054,7 @@ NetEase NetEase Spark- NetEase -- !query select split(c7, 'e'), split(c8, 'e'), split(v, 'a'), split(s, 'e') from char_tbl4 -- !query schema -struct,split(c8, e, 0):array,split(v, a, 0):array,split(s, e, 0):array> +struct,split(c8, e, -1):array,split(v, a, -1):array,split(s, e, -1):array> -- !query output NULL NULL NULL NULL NULL NULL ["S"] NULL diff --git a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out index f7a6fa585a253..c06e825a36def 100644 --- a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out @@ -272,6 +272,6 @@ struct> -- !query select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array(x))) -- !query schema -struct>> +struct>> -- !query output [["h"]] diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 0ec4f3645fb95..f7c8689e800cd 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -108,7 +108,7 @@ struct -- !query SELECT split('aa1cc2ee3', '[1-9]+') -- !query schema 
-struct> +struct> -- !query output ["aa","cc","ee",""] @@ -124,23 +124,15 @@ struct> -- !query SELECT split('hello', '') -- !query schema -struct> --- !query output -["h","e","l","l","o"] - - --- !query -SELECT split('hello', '', -1) --- !query schema struct> -- !query output -["h","e","l","l","o",""] +["h","e","l","l","o"] -- !query SELECT split('', '') -- !query schema -struct> +struct> -- !query output [""] @@ -148,7 +140,7 @@ struct> -- !query SELECT split('abc', null) -- !query schema -struct> +struct> -- !query output NULL @@ -156,7 +148,7 @@ NULL -- !query SELECT split(null, 'b') -- !query schema -struct> +struct> -- !query output NULL From 29344fcbd81d8950496df26d5a25eb223d9e0124 Mon Sep 17 00:00:00 2001 From: Vitalii Li Date: Wed, 24 Aug 2022 11:32:33 -0700 Subject: [PATCH 5/7] Update docs/sql-migration-guide.md Co-authored-by: Wenchen Fan --- docs/sql-migration-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 4f7d7908210e2..74334a7ce8d8f 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -28,7 +28,7 @@ license: | - Since Spark 3.4, v1 database, table, permanent view and function identifier will include 'spark_catalog' as the catalog name if database is defined, e.g. a table identifier will be: `spark_catalog.default.t`. To restore the legacy behavior, set `spark.sql.legacy.v1IdentifierNoCatalog` to `true`. - Since Spark 3.4, when ANSI SQL mode(configuration `spark.sql.ansi.enabled`) is on, Spark SQL always returns NULL result on getting a map value with a non-existing key. In Spark 3.3 or earlier, there will be an error. - Since Spark 3.4, the SQL CLI `spark-sql` does not print the prefix `Error in query:` before the error message of `AnalysisException`. - - Since Spark 3.4, `split` function ignores trailing empty strings when `regex` parameter is empty and `limit` parameter is not specified or set to 0 or -1. 
+ - Since Spark 3.4, `split` function ignores trailing empty strings when `regex` parameter is empty and `limit` parameter is not specified or set to a non-positive number. ## Upgrading from Spark SQL 3.2 to 3.3 From 74b3f0b53c2694dd0e6b0cdb1e8e5a1dffa85219 Mon Sep 17 00:00:00 2001 From: Vitalii Li Date: Mon, 29 Aug 2022 16:42:48 -0700 Subject: [PATCH 6/7] Always ignore empty string for empty pattern regardless of limit --- .../java/org/apache/spark/unsafe/types/UTF8String.java | 7 ++++--- .../org/apache/spark/unsafe/types/UTF8StringSuite.java | 10 ++++++++++ docs/sql-migration-guide.md | 2 +- .../catalyst/expressions/RegexpExpressionsSuite.scala | 6 ++++++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 7a16b53974b07..6e08fec7e8d92 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -1003,12 +1003,13 @@ public UTF8String[] split(UTF8String pattern, int limit) { // This is a special case for converting string into array of symbols without a trailing empty // string. E.g. `"hello".split("", 0) => ["h", "e", "l", "l", "o"]. // Note that negative limit will preserve a trailing empty string. - if (limit <= 0 && numBytes() != 0 && pattern.numBytes() == 0) { + if (numBytes() != 0 && pattern.numBytes() == 0) { + int newLimit = limit > numChars() || limit <= 0 ? 
numChars() : limit; byte[] input = getBytes(); int byteIndex = 0; int charIndex = 0; - UTF8String[] result = new UTF8String[numChars()]; - while (byteIndex < numBytes) { + UTF8String[] result = new UTF8String[newLimit]; + while (charIndex < newLimit) { int currCharNumBytes = numBytesForFirstByte(input[byteIndex]); result[charIndex++] = UTF8String.fromBytes(input, byteIndex, currCharNumBytes); byteIndex += currCharNumBytes; diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index f42bd490b1829..8411d448dc9fd 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -409,12 +409,22 @@ public void split() { assertArrayEquals( new UTF8String[]{fromString("ab"), fromString("def,ghi,")}, fromString("ab,def,ghi,").split(fromString(","), 2)); + // Split with empty pattern ignores trailing empty strings. 
assertArrayEquals( new UTF8String[]{fromString("a"), fromString("b")}, fromString("ab").split(fromString(""), 0)); assertArrayEquals( new UTF8String[]{fromString("a"), fromString("b")}, fromString("ab").split(fromString(""), -1)); + assertArrayEquals( + new UTF8String[]{fromString("a"), fromString("b")}, + fromString("ab").split(fromString(""), 2)); + assertArrayEquals( + new UTF8String[]{fromString("a"), fromString("b")}, + fromString("ab").split(fromString(""), 100)); + assertArrayEquals( + new UTF8String[]{fromString("a")}, + fromString("ab").split(fromString(""), 1)); assertArrayEquals( new UTF8String[]{fromString("")}, fromString("").split(fromString(""), 0)); diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 4f7d7908210e2..d69f245d8e8e3 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -28,7 +28,7 @@ license: | - Since Spark 3.4, v1 database, table, permanent view and function identifier will include 'spark_catalog' as the catalog name if database is defined, e.g. a table identifier will be: `spark_catalog.default.t`. To restore the legacy behavior, set `spark.sql.legacy.v1IdentifierNoCatalog` to `true`. - Since Spark 3.4, when ANSI SQL mode(configuration `spark.sql.ansi.enabled`) is on, Spark SQL always returns NULL result on getting a map value with a non-existing key. In Spark 3.3 or earlier, there will be an error. - Since Spark 3.4, the SQL CLI `spark-sql` does not print the prefix `Error in query:` before the error message of `AnalysisException`. - - Since Spark 3.4, `split` function ignores trailing empty strings when `regex` parameter is empty and `limit` parameter is not specified or set to 0 or -1. + - Since Spark 3.4, `split` function ignores trailing empty strings when `regex` parameter is empty. 
## Upgrading from Spark SQL 3.2 to 3.3 diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 7c756daba42c0..9089963ee852c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -466,6 +466,12 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { StringSplit(Literal("hello"), Literal(""), 0), Seq("h", "e", "l", "l", "o"), row1) checkEvaluation( StringSplit(Literal("hello"), Literal(""), -1), Seq("h", "e", "l", "l", "o"), row1) + checkEvaluation( + StringSplit(Literal("hello"), Literal(""), 5), Seq("h", "e", "l", "l", "o"), row1) + checkEvaluation( + StringSplit(Literal("hello"), Literal(""), 3), Seq("h", "e", "l"), row1) + checkEvaluation( + StringSplit(Literal("hello"), Literal(""), 100), Seq("h", "e", "l", "l", "o"), row1) checkEvaluation( StringSplit(Literal(""), Literal(""), -1), Seq(""), row1) checkEvaluation( From 7f6c0ddfbe23a8d78efa466a0d4313b3c3f4a874 Mon Sep 17 00:00:00 2001 From: Vitalii Li Date: Tue, 30 Aug 2022 09:36:55 -0700 Subject: [PATCH 7/7] Update comment for split function with empty pattern --- .../main/java/org/apache/spark/unsafe/types/UTF8String.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 6e08fec7e8d92..6a3e9d313b59a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -1000,9 +1000,8 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... 
inputs) { } public UTF8String[] split(UTF8String pattern, int limit) { - // This is a special case for converting string into array of symbols without a trailing empty - // string. E.g. `"hello".split("", 0) => ["h", "e", "l", "l", "o"]. - // Note that negative limit will preserve a trailing empty string. + // For an empty `pattern`, the `split` function ignores trailing empty strings unless the + // original string is empty. if (numBytes() != 0 && pattern.numBytes() == 0) { int newLimit = limit > numChars() || limit <= 0 ? numChars() : limit; byte[] input = getBytes();