apache · sitelight · Apr 14, 2026 · Apr 14, 2026
diff --git a/superset/sql/parse.py b/superset/sql/parse.py
@@ -32,9 +32,11 @@
 from sqlglot.dialects.dialect import (
     Dialect,
     Dialects,
+    DialectType,
 )
 from sqlglot.dialects.singlestore import SingleStore
 from sqlglot.errors import ParseError
+from sqlglot.generator import Generator
 from sqlglot.optimizer.pushdown_predicates import (
     pushdown_predicates,
 )
@@ -135,6 +137,29 @@ class CTASMethod(enum.Enum):
     VIEW = enum.auto()
 
 
+def _normalized_generator(
+    dialect_name: DialectType,
+    *,
+    pretty: bool,
+    comments: bool,
+) -> Generator:
+    """
+    Return a sqlglot generator that emits multi-argument DISTINCT verbatim.
+
+    Postgres, Presto, Trino, DuckDB and Dremio set ``MULTI_ARG_DISTINCT =
+    False`` to emulate ``COUNT(DISTINCT a, b)`` via a ``CASE WHEN``
+    row-expression, which corrupts user-defined multi-argument aggregates.
+    ``pretty`` and ``comments`` are forwarded to the generator unchanged;
+    sanitize / format normalize user SQL, they do not transpile.
+    """
+    generator = Dialect.get_or_raise(dialect_name).generator(
+        pretty=pretty,
+        comments=comments,
+    )
+    generator.MULTI_ARG_DISTINCT = True  # per-instance override; class untouched
+    return generator
+
+
 class RLSMethod(enum.Enum):
     """
     Methods for enforcing RLS.
@@ -723,12 +748,11 @@ def format(self, comments: bool = True) -> str:
         """
         Pretty-format the SQL statement.
         """
-        return Dialect.get_or_raise(self._dialect).generate(
-            self._parsed,
-            copy=True,
-            comments=comments,
+        return _normalized_generator(
+            self._dialect,
             pretty=True,
-        )
+            comments=comments,
+        ).generate(self._parsed, copy=True)
 
     def get_settings(self) -> dict[str, str | bool]:
         """
@@ -1551,14 +1575,13 @@ def sanitize_clause(clause: str, engine: str) -> str:
     """
     try:
         statement = SQLStatement(clause, engine)
-        dialect = SQLGLOT_DIALECTS.get(engine)
-        from sqlglot.dialects.dialect import Dialect
-
-        return Dialect.get_or_raise(dialect).generate(
+        return _normalized_generator(
+            SQLGLOT_DIALECTS.get(engine),
+            pretty=False,
+            comments=False,
+        ).generate(
             statement._parsed,  # pylint: disable=protected-access
             copy=True,
-            comments=False,
-            pretty=False,
         )
     except SupersetParseError as ex:
         raise QueryClauseValidationException(f"Invalid SQL clause: {clause}") from ex

diff --git a/tests/unit_tests/sql/parse_tests.py b/tests/unit_tests/sql/parse_tests.py
@@ -2704,6 +2704,46 @@ def test_is_valid_cvas(sql: str, engine: str, expected: bool) -> None:
         ("col1 = 1) AND (col2 = 2", QueryClauseValidationException, "base"),
         ("(col1 = 1)) AND ((col2 = 2)", QueryClauseValidationException, "base"),
         ("TRUE; SELECT 1", QueryClauseValidationException, "base"),
+        # Regression test for https://github.com/apache/superset/issues/39223:
+        # dialects with `MULTI_ARG_DISTINCT=False` (Postgres, Presto, Trino,
+        # DuckDB, Dremio) must not rewrite user-defined multi-argument DISTINCT
+        # aggregates into row-expression null guards.
+        (
+            "DISTINCT_AVG(DISTINCT report_id, time_to_accept / 86400)",
+            "DISTINCT_AVG(DISTINCT report_id, time_to_accept / 86400)",
+            "postgresql",
+        ),
+        (
+            "DISTINCT_SUM(DISTINCT report_id, total_bounty_reward_amount)",
+            "DISTINCT_SUM(DISTINCT report_id, total_bounty_reward_amount)",
+            "postgresql",
+        ),
+        (
+            "DISTINCT_AVG(DISTINCT k, v)",
+            "DISTINCT_AVG(DISTINCT k, v)",
+            "presto",
+        ),
+        (
+            "DISTINCT_AVG(DISTINCT k, v)",
+            "DISTINCT_AVG(DISTINCT k, v)",
+            "trino",
+        ),
+        (
+            "DISTINCT_AVG(DISTINCT k, v)",
+            "DISTINCT_AVG(DISTINCT k, v)",
+            "duckdb",
+        ),
+        (
+            "DISTINCT_AVG(DISTINCT k, v)",
+            "DISTINCT_AVG(DISTINCT k, v)",
+            "dremio",
+        ),
+        # Single-argument DISTINCT must still round-trip cleanly.
+        (
+            "COUNT(DISTINCT x)",
+            "COUNT(DISTINCT x)",
+            "postgresql",
+        ),
     ],
 )
 def test_sanitize_clause(sql: str, expected: str | Exception, engine: str) -> None:
@@ -2717,6 +2757,30 @@ def test_sanitize_clause(sql: str, expected: str | Exception, engine: str) -> No
             sanitize_clause(sql, engine)
 
 
+@pytest.mark.parametrize(
+    "engine",
+    [
+        "postgresql",
+        "presto",
+        "trino",
+        "duckdb",
+        "dremio",
+    ],
+)
+def test_sqlstatement_format_preserves_multi_arg_distinct(engine: str) -> None:
+    """
+    Regression guard for https://github.com/apache/superset/issues/39223:
+    ``SQLStatement.format()`` (exercised via ``SQLScript.format()``; assumed
+    to delegate per statement) must not rewrite user-defined multi-argument
+    DISTINCT aggregates into ``CASE WHEN`` null guards. SQL Lab / executor
+    path; the metric-expression path is covered by ``test_sanitize_clause``.
+    """
+    sql = "SELECT DISTINCT_AVG(DISTINCT a, b) FROM t"
+    formatted = SQLScript(sql, engine).format()
+    assert "DISTINCT_AVG(DISTINCT a, b)" in formatted  # call kept verbatim
+    assert "CASE WHEN" not in formatted  # emulation rewrite must be absent
+
+
 @pytest.mark.parametrize(
     "engine",
     [