From 4ee35653599842bc1bfdb68b4d22bf3d5f0b8433 Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Tue, 14 Jan 2025 19:33:57 +0800 Subject: [PATCH 1/4] fix get_json_object diff --- .../execution/GlutenFunctionValidateSuite.scala | 7 +++++++ .../Functions/SparkFunctionGetJsonObject.h | 16 +++++++++++----- .../clickhouse/ClickHouseTestSettings.scala | 10 ---------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala index d900bc000c05..cb51555b48ad 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala @@ -371,6 +371,13 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS " get_json_object(string_field1, '$.a') is not null") { _ => } } + test("Test get_json_object 12") { + runQueryAndCompare( + "SELECT get_json_object(string_field1, '$.a[*].y') from json_test where int_field1 = 7") { + _ => + } + } + test("Test covar_samp") { runQueryAndCompare("SELECT covar_samp(double_field1, int_field1) from json_test") { _ => } } diff --git a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h index fa9b78194be0..0f8ef41845db 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h @@ -462,8 +462,8 @@ class GetJsonObjectImpl static size_t getNumberOfIndexArguments(const DB::ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; } - bool insertResultToColumn(DB::IColumn & dest, const Element & root, DB::GeneratorJSONPath & generator_json_path, bool) - { + bool insertResultToColumn(DB::IColumn & dest, const Element & root, DB::GeneratorJSONPath & generator_json_path, bool path_has_asterisk) + { Element current_element = root; DB::VisitorStatus status; std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM @@ -492,7 +492,7 @@ class GetJsonObjectImpl DB::ColumnNullable & nullable_col_str = assert_cast(dest); DB::ColumnString * col_str = assert_cast(&nullable_col_str.getNestedColumn()); JSONStringSerializer serializer(*col_str); - if (elements.size() == 1) [[likely]] + if (elements.size() == 1 && (!path_has_asterisk || elements[0].isArray())) [[likely]] { if (elements[0].isNull()) return false; @@ -504,7 +504,7 @@ class GetJsonObjectImpl serializer.addRawString(str); } else - { + { serializer.addElement(elements[0]); } } @@ -684,6 +684,7 @@ class FlattenJSONStringOnRequiredFunction : public DB::IFunction std::vector json_path_asts; std::vector required_fields; + std::vector path_has_asterisk; const auto & first_column = arguments[0]; if (const auto * required_fields_col = typeid_cast(arguments[1].column.get())) { @@ -694,6 +695,11 @@ class FlattenJSONStringOnRequiredFunction : public DB::IFunction { auto normalized_field = JSONPathNormalizer::normalize(field); // LOG_ERROR(getLogger("JSONPatch"), "xxx field {} -> {}", field, normalized_field); + if(normalized_field.find("[*]") != std::string::npos) + path_has_asterisk.emplace_back(true); + else + path_has_asterisk.emplace_back(false); + required_fields.push_back(normalized_field); tuple_columns.emplace_back(str_type->createColumn()); @@ -776,7 +782,7 @@ class FlattenJSONStringOnRequiredFunction : public DB::IFunction for (size_t j = 0; j < tuple_size; ++j) { generator_json_paths[j]->reinitialize(); - if (!impl.insertResultToColumn(*tuple_columns[j], document, *generator_json_paths[j], true)) + if (!impl.insertResultToColumn(*tuple_columns[j], document, *generator_json_paths[j], path_has_asterisk[j])) { tuple_columns[j]->insertDefault(); } diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index f91841b991c7..9fedfe940db4 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -385,16 +385,6 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenJoinSuite].exclude( "SPARK-36794: Ignore duplicated key when building relation for semi/anti hash join") enableSuite[GlutenJsonExpressionsSuite] - .exclude("$.store.book[*]") - .exclude("$.store.book[*].category") - .exclude("$.store.book[*].isbn") - .exclude("$.store.basket[*]") - .exclude("$.store.basket[*][0]") - .exclude("$.store.basket[0][*]") - .exclude("$.store.basket[*][*]") - .exclude("$.store.basket[0][*].b") - .exclude("$.zip code") - .exclude("$.fb:testid") .exclude("from_json - invalid data") .exclude("from_json - input=object, schema=array, output=array of single row") .exclude("from_json - input=empty object, schema=array, output=array of single row with null") From 19aced3c9bfde34aec3931d79c0370d7a03cf4fa Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Wed, 15 Jan 2025 10:09:12 +0800 Subject: [PATCH 2/4] fix get_json_object --- .../gluten/execution/GlutenFunctionValidateSuite.scala | 4 ++++ .../local-engine/Functions/SparkFunctionGetJsonObject.h | 6 +++++- .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 9 --------- .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 9 --------- .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 9 --------- 5 files changed, 9 insertions(+), 28 deletions(-) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala index cb51555b48ad..6367afc27510 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala @@ -376,6 +376,10 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS "SELECT get_json_object(string_field1, '$.a[*].y') from json_test where int_field1 = 7") { _ => } + runQueryAndCompare( + "select get_json_object(string_field1, '$.a[*].z.n.p') from json_test where int_field1 = 7") { + _ => + } } test("Test covar_samp") { diff --git a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h index 0f8ef41845db..1399b422856c 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionGetJsonObject.h @@ -492,7 +492,7 @@ class GetJsonObjectImpl DB::ColumnNullable & nullable_col_str = assert_cast(dest); DB::ColumnString * col_str = assert_cast(&nullable_col_str.getNestedColumn()); JSONStringSerializer serializer(*col_str); - if (elements.size() == 1 && (!path_has_asterisk || elements[0].isArray())) [[likely]] + if (elements.size() == 1) [[likely]] { if (elements[0].isNull()) return false; @@ -501,6 +501,10 @@ class GetJsonObjectImpl if (elements[0].isString()) { auto str = elements[0].getString(); + if (path_has_asterisk) + { + str = "\"" + std::string(str) + "\""; + } serializer.addRawString(str); } else diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 8c62e3b0fd9b..a8eaab5486ca 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -757,16 +757,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-35728: Check multiply/divide of day-time intervals of any fields by numeric") .exclude("SPARK-35778: Check multiply/divide of year-month intervals of any fields by numeric") enableSuite[GlutenJsonExpressionsSuite] - .exclude("$.store.book[*]") - .exclude("$.store.book[*].category") - .exclude("$.store.book[*].isbn") - .exclude("$.store.basket[*]") - .exclude("$.store.basket[*][0]") - .exclude("$.store.basket[0][*]") - .exclude("$.store.basket[*][*]") .exclude("$.store.basket[0][*].b") - .exclude("$.zip code") - .exclude("$.fb:testid") .exclude("preserve newlines") .exclude("escape") .exclude("$..no_recursive") diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 9ebcadf53118..2c315d8aae89 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -385,16 +385,7 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenJoinSuite].exclude( "SPARK-36794: Ignore duplicated key when building relation for semi/anti hash join") enableSuite[GlutenJsonExpressionsSuite] - .exclude("$.store.book[*]") - .exclude("$.store.book[*].category") - .exclude("$.store.book[*].isbn") - .exclude("$.store.basket[*]") - .exclude("$.store.basket[*][0]") - .exclude("$.store.basket[0][*]") - .exclude("$.store.basket[*][*]") .exclude("$.store.basket[0][*].b") - .exclude("$.zip code") - .exclude("$.fb:testid") .exclude("from_json - invalid data") .exclude("from_json - input=object, schema=array, output=array of single row") .exclude("from_json - input=empty object, schema=array, output=array of single row with null") diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index f482ad921ee3..308c56f40548 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -385,16 +385,7 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenJoinSuite].exclude( "SPARK-36794: Ignore duplicated key when building relation for semi/anti hash join") enableSuite[GlutenJsonExpressionsSuite] - .exclude("$.store.book[*]") - .exclude("$.store.book[*].category") - .exclude("$.store.book[*].isbn") - .exclude("$.store.basket[*]") - .exclude("$.store.basket[*][0]") - .exclude("$.store.basket[0][*]") - .exclude("$.store.basket[*][*]") .exclude("$.store.basket[0][*].b") - .exclude("$.zip code") - .exclude("$.fb:testid") .exclude("from_json - invalid data") .exclude("from_json - input=object, schema=array, output=array of single row") .exclude("from_json - input=empty object, schema=array, output=array of single row with null") From c401de016019ee5da306bed403aee0ea16fd8a07 Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Wed, 15 Jan 2025 10:30:48 +0800 Subject: [PATCH 3/4] fix ut --- .../apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 9fedfe940db4..00738ae61169 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -403,6 +403,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("parse date with locale") .exclude("parse decimals using locale") // NOT use gluten + .exclude("$.store.basket[0][*].b") // issue: https://github.com/apache/incubator-gluten/issues/8529 .exclude("$..no_recursive") .exclude("non foldable literal") .exclude("some big value") From 1690d8ac589255ad4d1941cd74c5629db39b2f6d Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Wed, 15 Jan 2025 15:37:36 +0800 Subject: [PATCH 4/4] fix ci --- .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 00738ae61169..b979651ae2bf 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -385,6 +385,9 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenJoinSuite].exclude( "SPARK-36794: Ignore duplicated key when building relation for semi/anti hash join") enableSuite[GlutenJsonExpressionsSuite] + .exclude( + "$.store.basket[0][*].b" + ) // issue: https://github.com/apache/incubator-gluten/issues/8529 .exclude("from_json - invalid data") .exclude("from_json - input=object, schema=array, output=array of single row") .exclude("from_json - input=empty object, schema=array, output=array of single row with null") @@ -403,7 +406,6 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("parse date with locale") .exclude("parse decimals using locale") // NOT use gluten - .exclude("$.store.basket[0][*].b") // issue: https://github.com/apache/incubator-gluten/issues/8529 .exclude("$..no_recursive") .exclude("non foldable literal") .exclude("some big value")