From 768150320eb709c3b2d03b4ac50ab75f543ab2a1 Mon Sep 17 00:00:00 2001 From: lihangyu <15605149486@163.com> Date: Mon, 29 Jul 2024 20:45:11 +0800 Subject: [PATCH 1/2] [Feature](JsonReader) support sepecify `$.` as root column in json path (#38213) ``` curl --location-trusted -u root: -T value.json -H "read_json_by_line:true" -H "format:json" -H "max_filter_ratio:0.5" -H "jsonpaths: [\"$.id\", \"$.entity_id\", \"$.\"]" http://127.0.0.1:8149/api/regression_test/records/_stream_load ``` --- be/src/exprs/json_functions.cpp | 5 ++++ be/src/exprs/json_functions.h | 2 ++ .../vec/exec/format/json/new_json_reader.cpp | 14 +++++++++- .../load_p0/stream_load/test_json_load.out | 6 +++++ .../stream_load/test_read_root_path.json | 4 +++ .../load_p0/stream_load/test_json_load.groovy | 27 +++++++++++++++++++ 6 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 regression-test/data/load_p0/stream_load/test_read_root_path.json diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp index 29c1596ed8f7b9..a7602aefc0d49e 100644 --- a/be/src/exprs/json_functions.cpp +++ b/be/src/exprs/json_functions.cpp @@ -348,4 +348,9 @@ void JsonFunctions::merge_objects(rapidjson::Value& dst_object, rapidjson::Value } } +// root path "$." +bool JsonFunctions::is_root_path(const std::vector& json_path) { + return json_path.size() == 2 && json_path[0].key == "$" && json_path[1].key.empty(); +} + } // namespace doris diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h index 72aa522ff374fa..11970eb8c46c56 100644 --- a/be/src/exprs/json_functions.h +++ b/be/src/exprs/json_functions.h @@ -116,6 +116,8 @@ class JsonFunctions { static std::string print_json_value(const rapidjson::Value& value); + static bool is_root_path(const std::vector& json_path); + private: static rapidjson::Value* match_value(const std::vector& parsed_paths, rapidjson::Value* document, diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index 9872b1c3150837..40dc6dda5f997d 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -1659,7 +1659,19 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath( return st; } } - if (i >= _parsed_jsonpaths.size() || st.is()) { + if (i < _parsed_jsonpaths.size() && JsonFunctions::is_root_path(_parsed_jsonpaths[i])) { + // Indicate that the jsonpath is "$.", read the full root json object, insert the original doc directly + ColumnNullable* nullable_column = nullptr; + IColumn* target_column_ptr = nullptr; + if (slot_desc->is_nullable()) { + nullable_column = assert_cast(column_ptr); + target_column_ptr = &nullable_column->get_nested_column(); + } + auto* column_string = assert_cast(target_column_ptr); + column_string->insert_data(_simdjson_ondemand_padding_buffer.data(), + _original_doc_size); + has_valid_value = true; + } else if (i >= _parsed_jsonpaths.size() || st.is()) { // not match in jsondata, filling with default value RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); if (!(*valid)) { diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out b/regression-test/data/load_p0/stream_load/test_json_load.out index 588b6edb00463a..7df15b74b86f62 100644 --- a/regression-test/data/load_p0/stream_load/test_json_load.out +++ b/regression-test/data/load_p0/stream_load/test_json_load.out @@ -250,3 +250,9 @@ test k2_value -- !select29 -- 10 \N + +-- !select30 -- +12345 {"k1":12345,"k2":"11111","k3":111111,"k4":[11111]} {"k1":12345,"k2":"11111","k3":111111,"k4":[11111]} 111111 +12346 {"k1":12346,"k2":"22222","k4":[22222]} {"k1":12346,"k2":"22222","k4":[22222]} \N +12347 {"k1":12347,"k3":"33333","k4":[22222]} {"k1":12347,"k3":"33333","k4":[22222]} 33333 +12348 {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}} {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}} 33333 \ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_read_root_path.json b/regression-test/data/load_p0/stream_load/test_read_root_path.json new file mode 100644 index 00000000000000..777ccbbfb1f933 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_read_root_path.json @@ -0,0 +1,4 @@ +{"k1" : 12345, "k2" : "11111", "k3" : 111111, "k4" : [11111]} +{"k1" : 12346, "k2" : "22222", "k4" : [22222]} +{"k1" : 12347, "k3" : "33333", "k4" : [22222]} +{"k1" : 12348, "k3" : "33333", "k5" : {"k51" : 1024, "xxxx" : [11111]}} \ No newline at end of file diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy b/regression-test/suites/load_p0/stream_load/test_json_load.groovy index 7f182a44b74204..f0655678edb3f4 100644 --- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy +++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy @@ -878,4 +878,31 @@ suite("test_json_load", "p0") { } finally { try_sql("DROP TABLE IF EXISTS ${testTable}") } + + // support read "$." as root + try { + sql "DROP TABLE IF EXISTS ${testTable}" + sql """CREATE TABLE IF NOT EXISTS ${testTable} + ( + `k1` varchar(1024) NULL, + `k2` variant NULL, + `k3` variant NULL, + `k4` variant NULL + ) + DUPLICATE KEY(`k1`) + COMMENT '' + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + );""" + + load_json_data.call("${testTable}", "${testTable}_case30", 'false', 'true', 'json', '', '[\"$.k1\",\"$.\", \"$.\", \"$.k3\"]', + '', '', '', 'test_read_root_path.json') + + sql "sync" + qt_select30 "select * from ${testTable} order by k1" + + } finally { + // try_sql("DROP TABLE IF EXISTS ${testTable}") + } } From 775afb61d33ae1433368ca0477d828369ab70ed5 Mon Sep 17 00:00:00 2001 From: lihangyu <15605149486@163.com> Date: Mon, 12 Aug 2024 22:05:22 +0800 Subject: [PATCH 2/2] [Fix](JsonReader) Return correct status when parse failed (#39206) When using `JsonFunctions::extract_from_object`, but input obj is not object type but other types like null, then Status should be `simdjson::INCORRECT_TYPE` to fill the default value in the later process. Example, json path is `$.city.name`, but input json is `{"city" : null}` then `Status::NotFound` should be returned, in the following, column will be filled with default values. --- be/src/exprs/json_functions.cpp | 9 +++++-- .../test_json_extract_path_invalid_type.json | 13 +++++++++ .../load_p0/stream_load/test_json_load.out | 6 ++++- .../load_p0/stream_load/test_json_load.groovy | 27 +++++++++++++++++++ 4 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 regression-test/data/load_p0/stream_load/test_json_extract_path_invalid_type.json diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp index a7602aefc0d49e..ac4d64bc62fa68 100644 --- a/be/src/exprs/json_functions.cpp +++ b/be/src/exprs/json_functions.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include // IWYU pragma: keep #include @@ -254,13 +255,17 @@ Status JsonFunctions::extract_from_object(simdjson::ondemand::object& obj, const std::vector& jsonpath, simdjson::ondemand::value* value) noexcept { // Return DataQualityError when it's a malformed json. -// Otherwise the path was not found, due to array out of bound or not exist +// Otherwise the path was not found, due to +// 1. array out of bound +// 2. not exist such field in object +// 3. the input type is not object but could be null or other types and lead to simdjson::INCORRECT_TYPE #define HANDLE_SIMDJSON_ERROR(err, msg) \ do { \ const simdjson::error_code& _err = err; \ const std::string& _msg = msg; \ if (UNLIKELY(_err)) { \ - if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS) { \ + if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS || \ + _err == simdjson::INCORRECT_TYPE) { \ return Status::NotFound( \ fmt::format("Not found target filed, err: {}, msg: {}", \ simdjson::error_message(_err), _msg)); \ diff --git a/regression-test/data/load_p0/stream_load/test_json_extract_path_invalid_type.json b/regression-test/data/load_p0/stream_load/test_json_extract_path_invalid_type.json new file mode 100644 index 00000000000000..945b4143022892 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_json_extract_path_invalid_type.json @@ -0,0 +1,13 @@ +[ + { + "id": 789, + "city": { + "name": "beijing", + "region": "haidian" + } + }, + { + "id": 1111, + "city": null + } +] \ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out b/regression-test/data/load_p0/stream_load/test_json_load.out index 7df15b74b86f62..1d6777bb21e7ab 100644 --- a/regression-test/data/load_p0/stream_load/test_json_load.out +++ b/regression-test/data/load_p0/stream_load/test_json_load.out @@ -255,4 +255,8 @@ test k2_value 12345 {"k1":12345,"k2":"11111","k3":111111,"k4":[11111]} {"k1":12345,"k2":"11111","k3":111111,"k4":[11111]} 111111 12346 {"k1":12346,"k2":"22222","k4":[22222]} {"k1":12346,"k2":"22222","k4":[22222]} \N 12347 {"k1":12347,"k3":"33333","k4":[22222]} {"k1":12347,"k3":"33333","k4":[22222]} 33333 -12348 {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}} {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}} 33333 \ No newline at end of file +12348 {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}} {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}} 33333 + +-- !select31 -- +789 beijing haidian +1111 \N \N \ No newline at end of file diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy b/regression-test/suites/load_p0/stream_load/test_json_load.groovy index f0655678edb3f4..8b8e1417bcf494 100644 --- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy +++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy @@ -905,4 +905,31 @@ suite("test_json_load", "p0") { } finally { // try_sql("DROP TABLE IF EXISTS ${testTable}") } + + // test extract json path with invalid type(none object types like null) + try { + sql "DROP TABLE IF EXISTS ${testTable}" + sql """ + CREATE TABLE ${testTable} ( + `id` int NOT NULL, + `name` varchar(24) NULL, + `region` varchar(30) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT '' + DISTRIBUTED BY RANDOM BUCKETS AUTO + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + load_json_data.call("${testTable}", "${testTable}_case31", 'true', 'false', 'json', '', '[\"$.id\", \"$.city.name\", \"$.city.region\"]', + '', '', '', 'test_json_extract_path_invalid_type.json', false, 2) + + sql "sync" + qt_select31 "select * from ${testTable} order by id" + + } finally { + // try_sql("DROP TABLE IF EXISTS ${testTable}") + } }