From 15255129c6b75a0e86a6516a48f7a5a149d7c41d Mon Sep 17 00:00:00 2001 From: tkaunlaky-e6 Date: Mon, 2 Mar 2026 14:46:23 +0530 Subject: [PATCH 1/3] Preserve French and EU language characters in normalize_unicode_spaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The isascii() check was replacing ALL non-ASCII characters with spaces, which corrupted French chars like é, ç, ü (e.g. Téléchargement became T l chargement). Now uses unicodedata.category() to only normalize actual Unicode whitespace/separators (Zs, Zl, Zp) and U+FFFD, preserving all letter characters from EU languages. --- apis/utils/helpers.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/apis/utils/helpers.py b/apis/utils/helpers.py index fcbc511c73..7dbfcf7ea7 100644 --- a/apis/utils/helpers.py +++ b/apis/utils/helpers.py @@ -10,7 +10,6 @@ from sqlglot import exp, parse_one import typing as t from sqlglot.dialects.e6 import E6 -from curses.ascii import isascii FUNCTIONS_FILE = os.path.join(os.path.dirname(__file__), "supported_functions_in_all_dialects.json") logger = logging.getLogger(__name__) @@ -564,15 +563,13 @@ def normalize_unicode_spaces(sql: str) -> str: in_quote = ch out_chars.append(ch) else: - # Normalize replacement-char - if not isascii(ch): + # Normalize only Unicode whitespace/separators and U+FFFD, + # preserve all other non-ASCII chars (French, German, EU languages, etc.) + cat = unicodedata.category(ch) + if ch == "\uFFFD" or cat in ("Zs", "Zl", "Zp") or (ch.isspace() and ch not in "\r\n"): out_chars.append(" ") else: - cat = unicodedata.category(ch) - if (cat in ("Zs", "Zl", "Zp")) or (ch.isspace() and ch not in "\r\n"): - out_chars.append(" ") - else: - out_chars.append(ch) + out_chars.append(ch) i += 1 return "".join(out_chars) From 7cb6b75f2ca2bb0a970f3edf78270d151e887e55 Mon Sep 17 00:00:00 2001 From: tkaunlaky-e6 Date: Mon, 2 Mar 2026 14:51:15 +0530 Subject: [PATCH 2/3] Apply ruff-format linter fixes --- apis/utils/helpers.py | 6 +++++- sqlglot/transforms.py | 12 ++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/apis/utils/helpers.py b/apis/utils/helpers.py index 7dbfcf7ea7..c43e43b23b 100644 --- a/apis/utils/helpers.py +++ b/apis/utils/helpers.py @@ -566,7 +566,11 @@ def normalize_unicode_spaces(sql: str) -> str: # Normalize only Unicode whitespace/separators and U+FFFD, # preserve all other non-ASCII chars (French, German, EU languages, etc.) cat = unicodedata.category(ch) - if ch == "\uFFFD" or cat in ("Zs", "Zl", "Zp") or (ch.isspace() and ch not in "\r\n"): + if ( + ch == "\ufffd" + or cat in ("Zs", "Zl", "Zp") + or (ch.isspace() and ch not in "\r\n") + ): out_chars.append(" ") else: out_chars.append(ch) diff --git a/sqlglot/transforms.py b/sqlglot/transforms.py index 8a83023056..8b787250f1 100644 --- a/sqlglot/transforms.py +++ b/sqlglot/transforms.py @@ -936,9 +936,9 @@ def eliminate_join_marks(expression: exp.Expression) -> exp.Expression: if not left_join_table: continue - assert not ( - len(left_join_table) > 1 - ), "Cannot combine JOIN predicates from different tables" + assert not (len(left_join_table) > 1), ( + "Cannot combine JOIN predicates from different tables" + ) for col in join_cols: col.set("join_mark", False) @@ -968,9 +968,9 @@ def eliminate_join_marks(expression: exp.Expression) -> exp.Expression: if query_from.alias_or_name in new_joins: only_old_joins = old_joins.keys() - new_joins.keys() - assert ( - len(only_old_joins) >= 1 - ), "Cannot determine which table to use in the new FROM clause" + assert len(only_old_joins) >= 1, ( + "Cannot determine which table to use in the new FROM clause" + ) new_from_name = list(only_old_joins)[0] query.set("from", exp.From(this=old_joins[new_from_name].this)) From 19590a4d8c083b7a79f1d61f45347d57acf8c590 Mon Sep 17 00:00:00 2001 From: tkaunlaky-e6 Date: Mon, 2 Mar 2026 19:05:00 +0530 Subject: [PATCH 3/3] Fix ruff-format for CI (ruff 0.7.2 assert formatting) --- sqlglot/transforms.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sqlglot/transforms.py b/sqlglot/transforms.py index 8b787250f1..8a83023056 100644 --- a/sqlglot/transforms.py +++ b/sqlglot/transforms.py @@ -936,9 +936,9 @@ def eliminate_join_marks(expression: exp.Expression) -> exp.Expression: if not left_join_table: continue - assert not (len(left_join_table) > 1), ( - "Cannot combine JOIN predicates from different tables" - ) + assert not ( + len(left_join_table) > 1 + ), "Cannot combine JOIN predicates from different tables" for col in join_cols: col.set("join_mark", False) @@ -968,9 +968,9 @@ def eliminate_join_marks(expression: exp.Expression) -> exp.Expression: if query_from.alias_or_name in new_joins: only_old_joins = old_joins.keys() - new_joins.keys() - assert len(only_old_joins) >= 1, ( - "Cannot determine which table to use in the new FROM clause" - ) + assert ( + len(only_old_joins) >= 1 + ), "Cannot determine which table to use in the new FROM clause" new_from_name = list(only_old_joins)[0] query.set("from", exp.From(this=old_joins[new_from_name].this))