From bd71e692c80d2e7a4596f3f19753820cb8678086 Mon Sep 17 00:00:00 2001 From: Tanay Kulkarni Date: Mon, 21 Jul 2025 17:32:00 +0530 Subject: [PATCH 1/4] Added CONCAT_WS mapping --- sqlglot/dialects/e6.py | 52 +++++++++++++++++++++++++++++++++++++++ tests/dialects/test_e6.py | 43 ++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/sqlglot/dialects/e6.py b/sqlglot/dialects/e6.py index a77a47de59..d37af74d15 100644 --- a/sqlglot/dialects/e6.py +++ b/sqlglot/dialects/e6.py @@ -2033,6 +2033,57 @@ def string_agg_sql(self: E6.Generator, expression: exp.GroupConcat) -> str: # Generate SQL using STRING_AGG/LISTAGG, with separator or default '' return self.func("LISTAGG", expr_1, separator or exp.Literal.string("")) + def concat_ws_sql(self: E6.Generator, expression: exp.ConcatWs) -> str: + """ + Generate the SQL for the CONCAT_WS function in E6. + + Implements Databricks CONCAT_WS behavior: + - If sep is NULL the result is NULL (handled by e6 engine) + - exprN that are NULL are ignored + - If only separator provided or all exprN are NULL, returns empty string + - Each exprN can be STRING or ARRAY of STRING + - NULLs within arrays are filtered out + - Arrays are flattened and individual elements joined with separator + """ + if not expression.expressions: + return "''" + + # Extract separator and arguments + separator = expression.expressions[0] + args = expression.expressions[1:] if len(expression.expressions) > 1 else [] + + # If no arguments provided (only separator), return empty string + if not args: + return "''" + + # Process arguments: collect all individual elements + all_elements = [] + + for arg in args: + if isinstance(arg, exp.Array): + # For array arguments: extract individual elements and filter NULLs + # array('S', 'Q', NULL, 'L') becomes individual elements 'S', 'Q', 'L' + for element in arg.expressions: + if not isinstance(element, exp.Null): + all_elements.append(self.sql(element)) + else: + # For string arguments: add if not NULL + if not isinstance(arg, exp.Null): + all_elements.append(self.sql(arg)) + + # If no elements after filtering, return empty string + if not all_elements: + return "''" + + if len(all_elements) == 1: + # Single element case + return all_elements[0] + + # Multiple elements: create array and join with separator + # Build: ARRAY_TO_STRING(ARRAY[element1, element2, ...], separator) + elements_list = ", ".join(all_elements) + return f"ARRAY_TO_STRING(ARRAY[{elements_list}], {self.sql(separator)})" + # def struct_sql(self, expression: exp.Struct) -> str: # struct_expr = expression.expressions # return f"{struct_expr}" @@ -2247,6 +2298,7 @@ def split_sql(self, expression: exp.Split | exp.RegexpSplit): # We mapped this believing that for most of the cases, # CONCAT function in other dialects would mostly use for ARRAY concatenation exp.Concat: rename_func("CONCAT"), + exp.ConcatWs: concat_ws_sql, exp.Contains: rename_func("CONTAINS_SUBSTR"), exp.CurrentDate: lambda *_: "CURRENT_DATE", exp.CurrentTimestamp: lambda *_: "CURRENT_TIMESTAMP", diff --git a/tests/dialects/test_e6.py b/tests/dialects/test_e6.py index 3cd485d60f..536e8212c2 100644 --- a/tests/dialects/test_e6.py +++ b/tests/dialects/test_e6.py @@ -1604,6 +1604,49 @@ def test_string(self): read={"databricks": "SELECT to_varchar(x'537061726b2053514c', 'hex')"}, ) + # CONCAT_WS tests - based on Databricks documentation + # Basic string concatenation: concat_ws(' ', 'Spark', 'SQL') -> 'Spark SQL' + self.validate_all( + "SELECT ARRAY_TO_STRING(ARRAY['Spark', 'SQL'], ' ')", + read={"databricks": "SELECT concat_ws(' ', 'Spark', 'SQL')"}, + ) + + # Only separator provided: concat_ws('s') -> '' + self.validate_all( + "SELECT ''", + read={"databricks": "SELECT concat_ws('s')"}, + ) + + # Mixed strings, arrays and NULLs: concat_ws(',', 'Spark', array('S', 'Q', NULL, 'L'), NULL) -> 'Spark,S,Q,L' + self.validate_all( + "SELECT ARRAY_TO_STRING(ARRAY['Spark', 'S', 'Q', 'L'], ',')", + read={"databricks": "SELECT concat_ws(',', 'Spark', array('S', 'Q', NULL, 'L'), NULL)"}, + ) + + # Single string argument with separator + self.validate_all( + "SELECT 'test'", + read={"databricks": "SELECT concat_ws('-', 'test')"}, + ) + + # Multiple string arguments + self.validate_all( + "SELECT ARRAY_TO_STRING(ARRAY['a', 'b', 'c'], '-')", + read={"databricks": "SELECT concat_ws('-', 'a', 'b', 'c')"}, + ) + + # Empty separator + self.validate_all( + "SELECT ARRAY_TO_STRING(ARRAY['hello', 'world'], '')", + read={"databricks": "SELECT concat_ws('', 'hello', 'world')"}, + ) + + # Array with all valid elements (no NULLs) + self.validate_all( + "SELECT ARRAY_TO_STRING(ARRAY['x', 'y', 'z'], '|')", + read={"databricks": "SELECT concat_ws('|', array('x', 'y', 'z'))"}, + ) + def test_to_utf(self): self.validate_all( "TO_UTF8(x)", From 0501b84abc6619e4e454accfbf34cecdf742c76b Mon Sep 17 00:00:00 2001 From: Tanay Kulkarni Date: Mon, 21 Jul 2025 17:48:35 +0530 Subject: [PATCH 2/4] Added CONCAT_WS in apis/utils/supported_functions_in_all_dialects.json --- apis/utils/supported_functions_in_all_dialects.json | 1 + 1 file changed, 1 insertion(+) diff --git a/apis/utils/supported_functions_in_all_dialects.json b/apis/utils/supported_functions_in_all_dialects.json index d46d5af1ff..096d9f63de 100644 --- a/apis/utils/supported_functions_in_all_dialects.json +++ b/apis/utils/supported_functions_in_all_dialects.json @@ -643,6 +643,7 @@ "CAST", "SIGN", "CONCAT", + "CONCAT_WS", "USING", "ACOSH", "INITCAP", From bafb043a430dd6ac9b3f45e0972e6b127a05e87e Mon Sep 17 00:00:00 2001 From: Tanay Kulkarni Date: Thu, 24 Jul 2025 16:55:01 +0530 Subject: [PATCH 3/4] Remove CONCAT_WS --- apis/utils/supported_functions_in_all_dialects.json | 1 - 1 file changed, 1 deletion(-) diff --git a/apis/utils/supported_functions_in_all_dialects.json b/apis/utils/supported_functions_in_all_dialects.json index 096d9f63de..d46d5af1ff 100644 --- a/apis/utils/supported_functions_in_all_dialects.json +++ b/apis/utils/supported_functions_in_all_dialects.json @@ -643,7 +643,6 @@ "CAST", "SIGN", "CONCAT", - "CONCAT_WS", "USING", "ACOSH", "INITCAP", From ff408867bf7bdda49c91d599adbbda448d4c4ff5 Mon Sep 17 00:00:00 2001 From: Tanay Kulkarni Date: Mon, 4 Aug 2025 16:24:51 +0530 Subject: [PATCH 4/4] refactor: use expression nodes instead of hardcoded ARRAY string in CONCAT_WS - Replace hardcoded 'ARRAY' string with proper exp.Array expression node - Use self.func('ARRAY_TO_STRING', ...) directly to avoid ARRAY_JOIN mapping - Maintain all Databricks CONCAT_WS behaviors (NULL filtering, array flattening) --- sqlglot/dialects/e6.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/sqlglot/dialects/e6.py b/sqlglot/dialects/e6.py index d37af74d15..55f4ad6283 100644 --- a/sqlglot/dialects/e6.py +++ b/sqlglot/dialects/e6.py @@ -2056,33 +2056,36 @@ def concat_ws_sql(self: E6.Generator, expression: exp.ConcatWs) -> str: if not args: return "''" - # Process arguments: collect all individual elements - all_elements = [] + # Collect all non-NULL expression nodes (flattening arrays) + array_expressions = [] for arg in args: if isinstance(arg, exp.Array): - # For array arguments: extract individual elements and filter NULLs - # array('S', 'Q', NULL, 'L') becomes individual elements 'S', 'Q', 'L' + # For array arguments: add non-NULL elements for element in arg.expressions: if not isinstance(element, exp.Null): - all_elements.append(self.sql(element)) + array_expressions.append(element) else: # For string arguments: add if not NULL if not isinstance(arg, exp.Null): - all_elements.append(self.sql(arg)) + array_expressions.append(arg) # If no elements after filtering, return empty string - if not all_elements: + if not array_expressions: return "''" - if len(all_elements) == 1: - # Single element case - return all_elements[0] + # Single element case - just return the element + if len(array_expressions) == 1: + return self.sql(array_expressions[0]) # Multiple elements: create array and join with separator # Build: ARRAY_TO_STRING(ARRAY[element1, element2, ...], separator) - elements_list = ", ".join(all_elements) - return f"ARRAY_TO_STRING(ARRAY[{elements_list}], {self.sql(separator)})" + # Create Array expression with the actual expression nodes + array_expr = exp.Array(expressions=array_expressions) + + # Use ARRAY_TO_STRING function directly instead of exp.ArrayToString + # to avoid the ARRAY_JOIN mapping in TRANSFORMS + return self.func("ARRAY_TO_STRING", array_expr, separator) # def struct_sql(self, expression: exp.Struct) -> str: # struct_expr = expression.expressions