diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp index 29c1596ed8f7b9..ac4d64bc62fa68 100644 --- a/be/src/exprs/json_functions.cpp +++ b/be/src/exprs/json_functions.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include // IWYU pragma: keep #include @@ -254,13 +255,17 @@ Status JsonFunctions::extract_from_object(simdjson::ondemand::object& obj, const std::vector& jsonpath, simdjson::ondemand::value* value) noexcept { // Return DataQualityError when it's a malformed json. -// Otherwise the path was not found, due to array out of bound or not exist +// Otherwise the path was not found, due to +// 1. array out of bound +// 2. not exist such field in object +// 3. the input type is not object but could be null or other types and lead to simdjson::INCORRECT_TYPE #define HANDLE_SIMDJSON_ERROR(err, msg) \ do { \ const simdjson::error_code& _err = err; \ const std::string& _msg = msg; \ if (UNLIKELY(_err)) { \ - if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS) { \ + if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS || \ + _err == simdjson::INCORRECT_TYPE) { \ return Status::NotFound( \ fmt::format("Not found target filed, err: {}, msg: {}", \ simdjson::error_message(_err), _msg)); \ @@ -348,4 +353,9 @@ void JsonFunctions::merge_objects(rapidjson::Value& dst_object, rapidjson::Value } } +// root path "$." +bool JsonFunctions::is_root_path(const std::vector& json_path) { + return json_path.size() == 2 && json_path[0].key == "$" && json_path[1].key.empty(); +} + } // namespace doris diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h index 72aa522ff374fa..11970eb8c46c56 100644 --- a/be/src/exprs/json_functions.h +++ b/be/src/exprs/json_functions.h @@ -116,6 +116,8 @@ class JsonFunctions { static std::string print_json_value(const rapidjson::Value& value); + static bool is_root_path(const std::vector& json_path); + private: static rapidjson::Value* match_value(const std::vector& parsed_paths, rapidjson::Value* document, diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index 9872b1c3150837..40dc6dda5f997d 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -1659,7 +1659,19 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath( return st; } } - if (i >= _parsed_jsonpaths.size() || st.is()) { + if (i < _parsed_jsonpaths.size() && JsonFunctions::is_root_path(_parsed_jsonpaths[i])) { + // Indicate that the jsonpath is "$.", read the full root json object, insert the original doc directly + ColumnNullable* nullable_column = nullptr; + IColumn* target_column_ptr = nullptr; + if (slot_desc->is_nullable()) { + nullable_column = assert_cast(column_ptr); + target_column_ptr = &nullable_column->get_nested_column(); + } + auto* column_string = assert_cast(target_column_ptr); + column_string->insert_data(_simdjson_ondemand_padding_buffer.data(), + _original_doc_size); + has_valid_value = true; + } else if (i >= _parsed_jsonpaths.size() || st.is()) { // not match in jsondata, filling with default value RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); if (!(*valid)) { diff --git a/regression-test/data/load_p0/stream_load/test_json_extract_path_invalid_type.json b/regression-test/data/load_p0/stream_load/test_json_extract_path_invalid_type.json new file mode 100644 index 00000000000000..945b4143022892 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_json_extract_path_invalid_type.json @@ -0,0 +1,13 @@ +[ + { + "id": 789, + "city": { + "name": "beijing", + "region": "haidian" + } + }, + { + "id": 1111, + "city": null + } +] \ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out b/regression-test/data/load_p0/stream_load/test_json_load.out index 588b6edb00463a..1d6777bb21e7ab 100644 --- a/regression-test/data/load_p0/stream_load/test_json_load.out +++ b/regression-test/data/load_p0/stream_load/test_json_load.out @@ -250,3 +250,13 @@ test k2_value -- !select29 -- 10 \N + +-- !select30 -- +12345 {"k1":12345,"k2":"11111","k3":111111,"k4":[11111]} {"k1":12345,"k2":"11111","k3":111111,"k4":[11111]} 111111 +12346 {"k1":12346,"k2":"22222","k4":[22222]} {"k1":12346,"k2":"22222","k4":[22222]} \N +12347 {"k1":12347,"k3":"33333","k4":[22222]} {"k1":12347,"k3":"33333","k4":[22222]} 33333 +12348 {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}} {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}} 33333 + +-- !select31 -- +789 beijing haidian +1111 \N \N \ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_read_root_path.json b/regression-test/data/load_p0/stream_load/test_read_root_path.json new file mode 100644 index 00000000000000..777ccbbfb1f933 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_read_root_path.json @@ -0,0 +1,4 @@ +{"k1" : 12345, "k2" : "11111", "k3" : 111111, "k4" : [11111]} +{"k1" : 12346, "k2" : "22222", "k4" : [22222]} +{"k1" : 12347, "k3" : "33333", "k4" : [22222]} +{"k1" : 12348, "k3" : "33333", "k5" : {"k51" : 1024, "xxxx" : [11111]}} \ No newline at end of file diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy b/regression-test/suites/load_p0/stream_load/test_json_load.groovy index 7f182a44b74204..8b8e1417bcf494 100644 --- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy +++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy @@ -878,4 +878,58 @@ suite("test_json_load", "p0") { } finally { try_sql("DROP TABLE IF EXISTS ${testTable}") } + + // support read "$." as root + try { + sql "DROP TABLE IF EXISTS ${testTable}" + sql """CREATE TABLE IF NOT EXISTS ${testTable} + ( + `k1` varchar(1024) NULL, + `k2` variant NULL, + `k3` variant NULL, + `k4` variant NULL + ) + DUPLICATE KEY(`k1`) + COMMENT '' + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + );""" + + load_json_data.call("${testTable}", "${testTable}_case30", 'false', 'true', 'json', '', '[\"$.k1\",\"$.\", \"$.\", \"$.k3\"]', + '', '', '', 'test_read_root_path.json') + + sql "sync" + qt_select30 "select * from ${testTable} order by k1" + + } finally { + // try_sql("DROP TABLE IF EXISTS ${testTable}") + } + + // test extract json path with invalid type(none object types like null) + try { + sql "DROP TABLE IF EXISTS ${testTable}" + sql """ + CREATE TABLE ${testTable} ( + `id` int NOT NULL, + `name` varchar(24) NULL, + `region` varchar(30) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT '' + DISTRIBUTED BY RANDOM BUCKETS AUTO + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + load_json_data.call("${testTable}", "${testTable}_case31", 'true', 'false', 'json', '', '[\"$.id\", \"$.city.name\", \"$.city.region\"]', + '', '', '', 'test_json_extract_path_invalid_type.json', false, 2) + + sql "sync" + qt_select31 "select * from ${testTable} order by id" + + } finally { + // try_sql("DROP TABLE IF EXISTS ${testTable}") + } }