From c4fa8d8c1f7bdcb2a2a5ebab43ea02b6d69206dd Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 20 Aug 2025 09:35:35 -0700 Subject: [PATCH 1/5] Support distinct_count/dc in eventstats Signed-off-by: Kai Huang --- .../function/BuiltinFunctionName.java | 1 + docs/user/ppl/cmd/eventstats.rst | 37 +++++ .../sql/calcite/remote/CalciteExplainIT.java | 21 +++ .../remote/CalcitePPLEventstatsIT.java | 129 ++++++++++++++---- .../calcite/explain_eventstats_dc.json | 6 + .../explain_eventstats_distinct_count.json | 6 + ppl/src/main/antlr/OpenSearchPPLParser.g4 | 6 + .../sql/ppl/parser/AstExpressionBuilder.java | 9 +- 8 files changed, 189 insertions(+), 26 deletions(-) create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_eventstats_dc.json create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_eventstats_distinct_count.json diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index 6e278c4192e..8affa3bd09c 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -365,6 +365,7 @@ public enum BuiltinFunctionName { .put("stddev_samp", BuiltinFunctionName.STDDEV_SAMP) // .put("earliest", BuiltinFunctionName.EARLIEST) // .put("latest", BuiltinFunctionName.LATEST) + .put("distinct_count_approx", BuiltinFunctionName.DISTINCT_COUNT_APPROX) .put("pattern", BuiltinFunctionName.INTERNAL_PATTERN) .build(); diff --git a/docs/user/ppl/cmd/eventstats.rst b/docs/user/ppl/cmd/eventstats.rst index 134b0ea7a9b..2f312dad3ac 100644 --- a/docs/user/ppl/cmd/eventstats.rst +++ b/docs/user/ppl/cmd/eventstats.rst @@ -277,6 +277,41 @@ Example:: +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+ +DISTINCT_COUNT, DC +------------------ + +Description +>>>>>>>>>>> + +Usage: DISTINCT_COUNT(expr), DC(expr). Returns the approximate number of distinct values of expr using HyperLogLog++ algorithm. Both ``DISTINCT_COUNT`` and ``DC`` are equivalent and provide the same functionality. + +Example:: + + PPL> source=accounts | eventstats dc(state) as distinct_states; + fetched rows / total rows = 4/4 + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------+ + | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | distinct_states | + |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------| + | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 4 | + | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 4 | + | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 4 | + | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 4 | + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------+ + +Example with BY clause:: + + PPL> source=accounts | eventstats distinct_count(state) as distinct_states by gender; + fetched rows / total rows = 4/4 + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------+ + | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | distinct_states | + |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------| + | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 1 | + | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 3 | + | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 3 | + | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 3 | + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------+ + + Configuration ============= This command requires Calcite enabled. @@ -312,6 +347,8 @@ Eventstats:: source = table | where a < 50 | eventstats count(c) source = table | eventstats min(c), max(c) by b source = table | eventstats count(c) as count_by by b | where count_by > 1000 + source = table | eventstats dc(field) as distinct_count + source = table | eventstats distinct_count(category) by region Example 1: Calculate the average, sum and count of a field by group diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index 26290f92278..128221ff5a1 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -216,6 +216,27 @@ public void supportPushDownScriptOnTextField() throws IOException { assertJsonEqualsIgnoreId(expected, result); } + @Test + public void testEventstatsDistinctCountExplain() throws IOException { + Assume.assumeTrue("This test is only for push down enabled", isPushdownEnabled()); + String query = + "source=opensearch-sql_test_index_account | eventstats dc(state) as distinct_states"; + var result = explainQueryToString(query); + String expected = loadFromFile("expectedOutput/calcite/explain_eventstats_dc.json"); + assertJsonEqualsIgnoreId(expected, result); + } + + @Test + public void testEventstatsDistinctCountFunctionExplain() throws IOException { + Assume.assumeTrue("This test is only for push down enabled", isPushdownEnabled()); + String query = + "source=opensearch-sql_test_index_account | eventstats distinct_count(state) as" + + " distinct_states by gender"; + var result = explainQueryToString(query); + String expected = loadFromFile("expectedOutput/calcite/explain_eventstats_distinct_count.json"); + assertJsonEqualsIgnoreId(expected, result); + } + /** * Executes the PPL query and returns the result as a string with windows-style line breaks * replaced with Unix-style ones. diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java index 66febcb29fa..15ad300ae6e 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLEventstatsIT.java @@ -13,8 +13,6 @@ import org.json.JSONObject; import org.junit.Ignore; import org.junit.jupiter.api.Test; -import org.opensearch.client.Request; -import org.opensearch.sql.legacy.TestsConstants; import org.opensearch.sql.ppl.PPLIntegTestCase; public class CalcitePPLEventstatsIT extends PPLIntegTestCase { @@ -290,28 +288,6 @@ public void testUnsupportedWindowFunctions() { } } - @Ignore("DC should fail in window function") - public void testDistinctCountShouldFail() throws IOException { - Request request1 = - new Request("PUT", "/" + TestsConstants.TEST_INDEX_STATE_COUNTRY + "/_doc/5?refresh=true"); - request1.setJsonEntity( - "{\"name\":\"Jim\",\"age\":27,\"state\":\"Ontario\",\"country\":\"Canada\",\"year\":2023,\"month\":4}"); - client().performRequest(request1); - JSONObject actual = - executeQuery( - String.format( - "source=%s | eventstats distinct_count(state) by country", - TEST_INDEX_STATE_COUNTRY)); - - verifyDataRows( - actual, - rows("John", "Canada", "Ontario", 4, 2023, 25, 3), - rows("Jane", "Canada", "Quebec", 4, 2023, 20, 3), - rows("Jim", "Canada", "Ontario", 4, 2023, 27, 3), - rows("Jake", "USA", "California", 4, 2023, 70, 2), - rows("Hello", "USA", "New York", 4, 2023, 30, 2)); - } - @Test public void testMultipleEventstat() throws IOException { JSONObject actual = @@ -599,6 +575,111 @@ public void testEventstatVarianceWithNullBy() throws IOException { rows("Hello", "USA", "New York", 4, 2023, 30, 20, 28.284271247461902, 400, 800)); } + @Test + public void testEventstatDistinctCount() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats dc(state) as dc_state", TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("John", "Canada", "Ontario", 4, 2023, 25, 4), + rows("Jake", "USA", "California", 4, 2023, 70, 4), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4), + rows("Hello", "USA", "New York", 4, 2023, 30, 4)); + } + + @Test + public void testEventstatDistinctCountByCountry() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats dc(state) as dc_state by country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows("John", "Canada", "Ontario", 4, 2023, 25, 2), + rows("Jake", "USA", "California", 4, 2023, 70, 2), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2), + rows("Hello", "USA", "New York", 4, 2023, 30, 2)); + } + + @Test + public void testEventstatDistinctCountFunction() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats distinct_count(country) as dc_country", + TEST_INDEX_STATE_COUNTRY)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_country", "bigint")); + + verifyDataRows( + actual, + rows("John", "Canada", "Ontario", 4, 2023, 25, 2), + rows("Jake", "USA", "California", 4, 2023, 70, 2), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 2), + rows("Hello", "USA", "New York", 4, 2023, 30, 2)); + } + + @Test + public void testEventstatDistinctCountWithNull() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eventstats dc(state) as dc_state", + TEST_INDEX_STATE_COUNTRY_WITH_NULL)); + + verifySchemaInOrder( + actual, + schema("name", "string"), + schema("country", "string"), + schema("state", "string"), + schema("month", "int"), + schema("year", "int"), + schema("age", "int"), + schema("dc_state", "bigint")); + + verifyDataRows( + actual, + rows(null, "Canada", null, 4, 2023, 10, 4), + rows("Kevin", null, null, 4, 2023, null, 4), + rows("John", "Canada", "Ontario", 4, 2023, 25, 4), + rows("Jake", "USA", "California", 4, 2023, 70, 4), + rows("Jane", "Canada", "Quebec", 4, 2023, 20, 4), + rows("Hello", "USA", "New York", 4, 2023, 30, 4)); + } + @Ignore @Test public void testEventstatEarliestAndLatest() throws IOException { diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_eventstats_dc.json b/integ-test/src/test/resources/expectedOutput/calcite/explain_eventstats_dc.json new file mode 100644 index 00000000000..d9d477a31ea --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_eventstats_dc.json @@ -0,0 +1,6 @@ +{ + "calcite": { + "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[APPROX_DISTINCT_COUNT($7) OVER ()])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n", + "physical": "EnumerableLimit(fetch=[10000])\n EnumerableWindow(window#0=[window(aggs [APPROX_DISTINCT_COUNT($7)])])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"timeout\":\"1m\",\"_source\":{\"includes\":[\"account_number\",\"firstname\",\"address\",\"balance\",\"gender\",\"city\",\"employer\",\"state\",\"age\",\"email\",\"lastname\"],\"excludes\":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)])\n" + } +} \ No newline at end of file diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_eventstats_distinct_count.json b/integ-test/src/test/resources/expectedOutput/calcite/explain_eventstats_distinct_count.json new file mode 100644 index 00000000000..c63ee04426f --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_eventstats_distinct_count.json @@ -0,0 +1,6 @@ +{ + "calcite": { + "logical": "LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])\n LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], distinct_states=[APPROX_DISTINCT_COUNT($7) OVER (PARTITION BY $4)])\n CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])\n", + "physical": "EnumerableLimit(fetch=[10000])\n EnumerableWindow(window#0=[window(partition {4} aggs [APPROX_DISTINCT_COUNT($7)])])\n CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname]], OpenSearchRequestBuilder(sourceBuilder={\"from\":0,\"timeout\":\"1m\",\"_source\":{\"includes\":[\"account_number\",\"firstname\",\"address\",\"balance\",\"gender\",\"city\",\"employer\",\"state\",\"age\",\"email\",\"lastname\"],\"excludes\":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)])\n" + } +} \ No newline at end of file diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 3f75da1245e..18cc44fa29f 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -403,6 +403,7 @@ windowFunction windowFunctionName : statsFunctionName | scalarWindowFunctionName + | distinctCountWindowFunction ; scalarWindowFunctionName @@ -417,6 +418,11 @@ scalarWindowFunctionName | NTILE ; +distinctCountWindowFunction + : DISTINCT_COUNT + | DC + ; + // aggregation terms statsAggTerm : statsFunction (AS alias = wcFieldExpression)? diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java index cd5e3a76d8d..c9451a4f40e 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java @@ -567,8 +567,13 @@ public UnresolvedExpression visitBetween(OpenSearchPPLParser.BetweenContext ctx) @Override public UnresolvedExpression visitWindowFunction(OpenSearchPPLParser.WindowFunctionContext ctx) { - Function f = - buildFunction(ctx.windowFunctionName().getText(), ctx.functionArgs().functionArg()); + Function f; + if (ctx.windowFunctionName().distinctCountWindowFunction() != null) { + // All distinct count variants (dc, distinct_count, distinct_count_approx) use the UDAF + f = buildFunction("distinct_count_approx", ctx.functionArgs().functionArg()); + } else { + f = buildFunction(ctx.windowFunctionName().getText(), ctx.functionArgs().functionArg()); + } // In PPL eventstats command, all window functions have the same partition and order spec. return new WindowFunction(f); } From 10126c6c6ec1c8121a2ccb46d655c5f6ef50ab75 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 20 Aug 2025 12:52:50 -0700 Subject: [PATCH 2/5] Add UT and optimize code Signed-off-by: Kai Huang --- .../function/BuiltinFunctionName.java | 2 ++ .../expression/function/PPLFuncImpTable.java | 10 ++++++ docs/user/ppl/cmd/eventstats.rst | 31 +++++----------- ppl/src/main/antlr/OpenSearchPPLParser.g4 | 6 +--- .../sql/ppl/parser/AstExpressionBuilder.java | 9 ++--- .../ppl/calcite/CalcitePPLEventstatsTest.java | 35 +++++++++++++++++++ 6 files changed, 59 insertions(+), 34 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index 8affa3bd09c..de77a2a349a 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -366,6 +366,8 @@ public enum BuiltinFunctionName { // .put("earliest", BuiltinFunctionName.EARLIEST) // .put("latest", BuiltinFunctionName.LATEST) .put("distinct_count_approx", BuiltinFunctionName.DISTINCT_COUNT_APPROX) + .put("dc", BuiltinFunctionName.DISTINCT_COUNT_APPROX) + .put("distinct_count", BuiltinFunctionName.DISTINCT_COUNT_APPROX) .put("pattern", BuiltinFunctionName.INTERNAL_PATTERN) .build(); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index adee1fba502..7562d32522f 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -61,6 +61,7 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.DAY_OF_WEEK; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DAY_OF_YEAR; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DEGREES; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.DISTINCT_COUNT_APPROX; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DIVIDE; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DIVIDEFUNCTION; import static org.opensearch.sql.expression.function.BuiltinFunctionName.E; @@ -1122,6 +1123,15 @@ void populate() { wrapSqlOperandTypeChecker( SqlStdOperatorTable.COUNT.getOperandTypeChecker(), COUNT.name(), false)); + register( + DISTINCT_COUNT_APPROX, + (distinct, field, argList, ctx) -> + ctx.relBuilder.aggregateCall(SqlStdOperatorTable.APPROX_COUNT_DISTINCT, field), + wrapSqlOperandTypeChecker( + SqlStdOperatorTable.APPROX_COUNT_DISTINCT.getOperandTypeChecker(), + DISTINCT_COUNT_APPROX.name(), + false)); + register( VARSAMP, (distinct, field, argList, ctx) -> ctx.relBuilder.aggregateCall(VAR_SAMP_NULLABLE, field), diff --git a/docs/user/ppl/cmd/eventstats.rst b/docs/user/ppl/cmd/eventstats.rst index 2f312dad3ac..de641cdf64a 100644 --- a/docs/user/ppl/cmd/eventstats.rst +++ b/docs/user/ppl/cmd/eventstats.rst @@ -287,29 +287,16 @@ Usage: DISTINCT_COUNT(expr), DC(expr). Returns the approximate number of distinc Example:: - PPL> source=accounts | eventstats dc(state) as distinct_states; + PPL> source=accounts | eventstats dc(state) as distinct_states, distinct_count(state) as dc_states_alt by gender; fetched rows / total rows = 4/4 - +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------+ - | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | distinct_states | - |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------| - | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 4 | - | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 4 | - | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 4 | - | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 4 | - +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------+ - -Example with BY clause:: - - PPL> source=accounts | eventstats distinct_count(state) as distinct_states by gender; - fetched rows / total rows = 4/4 - +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------+ - | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | distinct_states | - |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------| - | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 1 | - | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 3 | - | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 3 | - | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 3 | - +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------+ + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------+-----------------+ + | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | distinct_states | dc_states_alt | + |----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------|-----------------| + | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | 1 | 1 | + | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | 3 | 3 | + | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | 3 | 3 | + | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | 3 | 3 | + +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+-----------------+-----------------+ Configuration diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 18cc44fa29f..40d5b662a2e 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -403,7 +403,6 @@ windowFunction windowFunctionName : statsFunctionName | scalarWindowFunctionName - | distinctCountWindowFunction ; scalarWindowFunctionName @@ -416,10 +415,7 @@ scalarWindowFunctionName | LAST | NTH | NTILE - ; - -distinctCountWindowFunction - : DISTINCT_COUNT + | DISTINCT_COUNT | DC ; diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java index c9451a4f40e..cd5e3a76d8d 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java @@ -567,13 +567,8 @@ public UnresolvedExpression visitBetween(OpenSearchPPLParser.BetweenContext ctx) @Override public UnresolvedExpression visitWindowFunction(OpenSearchPPLParser.WindowFunctionContext ctx) { - Function f; - if (ctx.windowFunctionName().distinctCountWindowFunction() != null) { - // All distinct count variants (dc, distinct_count, distinct_count_approx) use the UDAF - f = buildFunction("distinct_count_approx", ctx.functionArgs().functionArg()); - } else { - f = buildFunction(ctx.windowFunctionName().getText(), ctx.functionArgs().functionArg()); - } + Function f = + buildFunction(ctx.windowFunctionName().getText(), ctx.functionArgs().functionArg()); // In PPL eventstats command, all window functions have the same partition and order spec. return new WindowFunction(f); } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLEventstatsTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLEventstatsTest.java index cd808621407..d5f1b2c07cc 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLEventstatsTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLEventstatsTest.java @@ -70,4 +70,39 @@ public void testEventstatsAvg() { + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } + + @Test + public void testEventstatsDistinctCount() { + String ppl = "source=EMP | eventstats dc(DEPTNO)"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], dc(DEPTNO)=[APPROX_COUNT_DISTINCT($7) OVER ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + } + + @Test + public void testEventstatsDistinctCountFunction() { + String ppl = "source=EMP | eventstats distinct_count(DEPTNO)"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], distinct_count(DEPTNO)=[APPROX_COUNT_DISTINCT($7) OVER" + + " ()])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + } + + @Test + public void testEventstatsDistinctCountWithPartition() { + String ppl = "source=EMP | eventstats dc(JOB) by DEPTNO"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], dc(JOB)=[APPROX_COUNT_DISTINCT($2) OVER (PARTITION BY" + + " $7)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + } } From e9e156918e0757e6fc345274460df51d7b56233f Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 21 Aug 2025 23:23:38 -0700 Subject: [PATCH 3/5] doc Signed-off-by: Kai Huang --- docs/user/ppl/cmd/eventstats.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/ppl/cmd/eventstats.rst b/docs/user/ppl/cmd/eventstats.rst index de641cdf64a..c075d87ce04 100644 --- a/docs/user/ppl/cmd/eventstats.rst +++ b/docs/user/ppl/cmd/eventstats.rst @@ -277,7 +277,7 @@ Example:: +----------------+-----------+----------------------+---------+--------+--------+----------+-------+-----+-----------------------+----------+--------------------+ -DISTINCT_COUNT, DC +DISTINCT_COUNT, DC(Since 3.3) ------------------ Description From d089e9ff374633c72e804159519823561509fec0 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 21 Aug 2025 23:32:27 -0700 Subject: [PATCH 4/5] fix CI Signed-off-by: Kai Huang --- .../org/opensearch/sql/calcite/remote/CalciteExplainIT.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index ec29655f1e2..4bdf112c018 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -236,6 +236,8 @@ public void testEventstatsDistinctCountFunctionExplain() throws IOException { var result = explainQueryToString(query); String expected = loadFromFile("expectedOutput/calcite/explain_eventstats_distinct_count.json"); assertJsonEqualsIgnoreId(expected, result); + } + // Only for Calcite, as v2 gets unstable serialized string for function @Test public void testExplainOnAggregationWithSumEnhancement() throws IOException { From 8b5e7885bff680b3f8a03fcb90522ec7bebb57a2 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 21 Aug 2025 23:57:09 -0700 Subject: [PATCH 5/5] remove unit test and registration Signed-off-by: Kai Huang --- .../expression/function/PPLFuncImpTable.java | 10 ------ .../ppl/calcite/CalcitePPLEventstatsTest.java | 35 ------------------- 2 files changed, 45 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index 7562d32522f..adee1fba502 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -61,7 +61,6 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.DAY_OF_WEEK; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DAY_OF_YEAR; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DEGREES; -import static org.opensearch.sql.expression.function.BuiltinFunctionName.DISTINCT_COUNT_APPROX; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DIVIDE; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DIVIDEFUNCTION; import static org.opensearch.sql.expression.function.BuiltinFunctionName.E; @@ -1123,15 +1122,6 @@ void populate() { wrapSqlOperandTypeChecker( SqlStdOperatorTable.COUNT.getOperandTypeChecker(), COUNT.name(), false)); - register( - DISTINCT_COUNT_APPROX, - (distinct, field, argList, ctx) -> - ctx.relBuilder.aggregateCall(SqlStdOperatorTable.APPROX_COUNT_DISTINCT, field), - wrapSqlOperandTypeChecker( - SqlStdOperatorTable.APPROX_COUNT_DISTINCT.getOperandTypeChecker(), - DISTINCT_COUNT_APPROX.name(), - false)); - register( VARSAMP, (distinct, field, argList, ctx) -> ctx.relBuilder.aggregateCall(VAR_SAMP_NULLABLE, field), diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLEventstatsTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLEventstatsTest.java index d5f1b2c07cc..cd808621407 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLEventstatsTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLEventstatsTest.java @@ -70,39 +70,4 @@ public void testEventstatsAvg() { + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } - - @Test - public void testEventstatsDistinctCount() { - String ppl = "source=EMP | eventstats dc(DEPTNO)"; - RelNode root = getRelNode(ppl); - String expectedLogical = - "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," - + " COMM=[$6], DEPTNO=[$7], dc(DEPTNO)=[APPROX_COUNT_DISTINCT($7) OVER ()])\n" - + " LogicalTableScan(table=[[scott, EMP]])\n"; - verifyLogical(root, expectedLogical); - } - - @Test - public void testEventstatsDistinctCountFunction() { - String ppl = "source=EMP | eventstats distinct_count(DEPTNO)"; - RelNode root = getRelNode(ppl); - String expectedLogical = - "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," - + " COMM=[$6], DEPTNO=[$7], distinct_count(DEPTNO)=[APPROX_COUNT_DISTINCT($7) OVER" - + " ()])\n" - + " LogicalTableScan(table=[[scott, EMP]])\n"; - verifyLogical(root, expectedLogical); - } - - @Test - public void testEventstatsDistinctCountWithPartition() { - String ppl = "source=EMP | eventstats dc(JOB) by DEPTNO"; - RelNode root = getRelNode(ppl); - String expectedLogical = - "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," - + " COMM=[$6], DEPTNO=[$7], dc(JOB)=[APPROX_COUNT_DISTINCT($2) OVER (PARTITION BY" - + " $7)])\n" - + " LogicalTableScan(table=[[scott, EMP]])\n"; - verifyLogical(root, expectedLogical); - } }