From d1cfd0cb755e7d42eb5be129a3b211ef034046a0 Mon Sep 17 00:00:00 2001 From: zhangstar333 <2561612514@qq.com> Date: Thu, 27 Jun 2024 16:26:32 +0800 Subject: [PATCH 1/5] [test](fucntion) --- be/src/vec/functions/function_json.cpp | 112 +++++++++++++++++++------ 1 file changed, 88 insertions(+), 24 deletions(-) diff --git a/be/src/vec/functions/function_json.cpp b/be/src/vec/functions/function_json.cpp index e7c2fc1781dfc5..60a9d97d740c76 100644 --- a/be/src/vec/functions/function_json.cpp +++ b/be/src/vec/functions/function_json.cpp @@ -831,48 +831,107 @@ struct FunctionJsonExtractImpl { rapidjson::Value value; rapidjson::Document document; - const auto obj = json_col->get_data_at(row); + const auto& obj = json_col->get_data_at(row); std::string_view json_string(obj.data, obj.size); - const auto path = path_col->get_data_at(row); + const auto& path = path_col->get_data_at(row); std::string_view path_string(path.data, path.size); - - auto root = get_json_object(json_string, path_string, &document); + auto* root = get_json_object(json_string, path_string, &document); if (root != nullptr) { value.CopyFrom(*root, allocator); } return value; } + static rapidjson::Value* get_document(const std::vector& data_columns, + rapidjson::Document* document, + std::vector& parsed_paths) { + const auto& path = data_columns[1]->get_data_at(0); + std::string_view path_string(path.data, path.size); + //Cannot use '\' as the last character, return NULL + if (path_string.back() == '\\') { + document->SetNull(); + return nullptr; + } + +#ifdef USE_LIBCPP + std::string s(path_string); + auto tok = get_json_token(s); +#else + auto tok = get_json_token(path_string); +#endif + std::vector paths(tok.begin(), tok.end()); + get_parsed_paths(paths, &parsed_paths); + if (parsed_paths.empty()) { + return nullptr; + } + if (!(parsed_paths)[0].is_valid) { + return nullptr; + } + return document; + } + static void execute(const std::vector& data_columns, - ColumnString& result_column, NullMap& null_map, size_t input_rows_count) { + ColumnString& result_column, NullMap& null_map, size_t input_rows_count, + std::vector& column_is_consts) { rapidjson::Document document; rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); rapidjson::StringBuffer buf; rapidjson::Writer writer(buf); - - const auto json_col = data_columns[0]; - for (size_t row = 0; row < input_rows_count; row++) { + const auto* json_col = data_columns[0]; + if (data_columns.size() == 2 && column_is_consts[1]) { rapidjson::Value value; - if (data_columns.size() == 2) { - value = parse_json(json_col, data_columns[1], allocator, row); - } else { + std::vector parsed_paths; + auto* root = get_document(data_columns, &document, parsed_paths); + for (size_t row = 0; row < input_rows_count; row++) { + if (root != nullptr) { + const auto& obj = json_col->get_data_at(row); + std::string_view json_string(obj.data, obj.size); + if (UNLIKELY((parsed_paths).size() == 1)) { + document.SetString(json_string.data(), json_string.size(), allocator); + } + + document.Parse(json_string.data(), json_string.size()); + if (UNLIKELY(document.HasParseError())) { + null_map[row] = 1; + result_column.insert_default(); + continue; + } + auto* root_val = match_value(parsed_paths, &document, allocator); + if (root_val != nullptr) { + value.CopyFrom(*root_val, allocator); + } + } + if (value.IsNull()) { + null_map[row] = 1; + result_column.insert_default(); + } else { + // write value as string + buf.Clear(); + writer.Reset(buf); + value.Accept(writer); + result_column.insert_data(buf.GetString(), buf.GetSize()); + } + } + } else { + for (size_t row = 0; row < input_rows_count; row++) { + rapidjson::Value value; value.SetArray(); value.Reserve(data_columns.size() - 1, allocator); for (size_t col = 1; col < data_columns.size(); ++col) { value.PushBack(parse_json(json_col, data_columns[col], allocator, row), allocator); } - } - if (value.IsNull()) { - null_map[row] = 1; - result_column.insert_default(); - } else { - // write value as string - buf.Clear(); - writer.Reset(buf); - value.Accept(writer); - result_column.insert_data(buf.GetString(), buf.GetSize()); + if (value.IsNull()) { + null_map[row] = 1; + result_column.insert_default(); + } else { + // write value as string + buf.Clear(); + writer.Reset(buf); + value.Accept(writer); + result_column.insert_data(buf.GetString(), buf.GetSize()); + } } } } @@ -931,13 +990,18 @@ class FunctionJsonNullable : public IFunction { auto null_map = ColumnUInt8::create(input_rows_count, 0); std::vector column_ptrs; // prevent converted column destruct std::vector data_columns; + std::vector column_is_consts; for (int i = 0; i < arguments.size(); i++) { - column_ptrs.push_back( - block.get_by_position(arguments[i]).column->convert_to_full_column_if_const()); + ColumnPtr arg_col; + bool arg_const; + std::tie(arg_col, arg_const) = + unpack_if_const(block.get_by_position(arguments[i]).column); + column_is_consts.push_back(arg_const); + column_ptrs.push_back(arg_col); data_columns.push_back(assert_cast(column_ptrs.back().get())); } Impl::execute(data_columns, *assert_cast(result_column.get()), - null_map->get_data(), input_rows_count); + null_map->get_data(), input_rows_count, column_is_consts); block.replace_by_position( result, ColumnNullable::create(std::move(result_column), std::move(null_map))); return Status::OK(); From 02a3f4f65dbf3de24bf38b0d600bbd9772613adf Mon Sep 17 00:00:00 2001 From: zhangstar333 <2561612514@qq.com> Date: Thu, 27 Jun 2024 21:08:05 +0800 Subject: [PATCH 2/5] update --- be/src/vec/functions/function_json.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/be/src/vec/functions/function_json.cpp b/be/src/vec/functions/function_json.cpp index 60a9d97d740c76..2f1e2ef92fa9a4 100644 --- a/be/src/vec/functions/function_json.cpp +++ b/be/src/vec/functions/function_json.cpp @@ -826,14 +826,14 @@ struct FunctionJsonExtractImpl { static constexpr auto name = "json_extract"; static rapidjson::Value parse_json(const ColumnString* json_col, const ColumnString* path_col, - rapidjson::Document::AllocatorType& allocator, - const int row) { + rapidjson::Document::AllocatorType& allocator, const int row, + const int col, std::vector& column_is_consts) { rapidjson::Value value; rapidjson::Document document; - const auto& obj = json_col->get_data_at(row); + const auto& obj = json_col->get_data_at(index_check_const(row, column_is_consts[0])); std::string_view json_string(obj.data, obj.size); - const auto& path = path_col->get_data_at(row); + const auto& path = path_col->get_data_at(index_check_const(row, column_is_consts[col])); std::string_view path_string(path.data, path.size); auto* root = get_json_object(json_string, path_string, &document); if (root != nullptr) { @@ -918,7 +918,8 @@ struct FunctionJsonExtractImpl { value.SetArray(); value.Reserve(data_columns.size() - 1, allocator); for (size_t col = 1; col < data_columns.size(); ++col) { - value.PushBack(parse_json(json_col, data_columns[col], allocator, row), + value.PushBack(parse_json(json_col, data_columns[col], allocator, row, col, + column_is_consts), allocator); } From 85ba0884ab187259591c172f999ed2faba4d4391 Mon Sep 17 00:00:00 2001 From: zhangstar333 <2561612514@qq.com> Date: Fri, 28 Jun 2024 12:00:03 +0800 Subject: [PATCH 3/5] update --- be/src/vec/functions/function_json.cpp | 96 ++++++++++++++------------ 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/be/src/vec/functions/function_json.cpp b/be/src/vec/functions/function_json.cpp index 2f1e2ef92fa9a4..58a81fd267ff27 100644 --- a/be/src/vec/functions/function_json.cpp +++ b/be/src/vec/functions/function_json.cpp @@ -842,10 +842,11 @@ struct FunctionJsonExtractImpl { return value; } - static rapidjson::Value* get_document(const std::vector& data_columns, + static rapidjson::Value* get_document(const ColumnString* path_col, rapidjson::Document* document, - std::vector& parsed_paths) { - const auto& path = data_columns[1]->get_data_at(0); + std::vector& parsed_paths, const int row, + bool is_const_column) { + const auto& path = path_col->get_data_at(index_check_const(row, is_const_column)); std::string_view path_string(path.data, path.size); //Cannot use '\' as the last character, return NULL if (path_string.back() == '\\') { @@ -878,61 +879,64 @@ struct FunctionJsonExtractImpl { rapidjson::StringBuffer buf; rapidjson::Writer writer(buf); const auto* json_col = data_columns[0]; - if (data_columns.size() == 2 && column_is_consts[1]) { + auto insert_result_lambda = [&](rapidjson::Value& value, int row) { + if (value.IsNull()) { + null_map[row] = 1; + result_column.insert_default(); + } else { + // write value as string + buf.Clear(); + writer.Reset(buf); + value.Accept(writer); + result_column.insert_data(buf.GetString(), buf.GetSize()); + } + }; + if (data_columns.size() == 2) { rapidjson::Value value; - std::vector parsed_paths; - auto* root = get_document(data_columns, &document, parsed_paths); - for (size_t row = 0; row < input_rows_count; row++) { - if (root != nullptr) { - const auto& obj = json_col->get_data_at(row); - std::string_view json_string(obj.data, obj.size); - if (UNLIKELY((parsed_paths).size() == 1)) { - document.SetString(json_string.data(), json_string.size(), allocator); - } - - document.Parse(json_string.data(), json_string.size()); - if (UNLIKELY(document.HasParseError())) { - null_map[row] = 1; - result_column.insert_default(); - continue; - } - auto* root_val = match_value(parsed_paths, &document, allocator); - if (root_val != nullptr) { - value.CopyFrom(*root_val, allocator); + if (column_is_consts[1]) { + std::vector parsed_paths; + auto* root = get_document(data_columns[1], &document, parsed_paths, 0, + column_is_consts[1]); + for (size_t row = 0; row < input_rows_count; row++) { + if (root != nullptr) { + const auto& obj = json_col->get_data_at(row); + std::string_view json_string(obj.data, obj.size); + if (UNLIKELY((parsed_paths).size() == 1)) { + document.SetString(json_string.data(), json_string.size(), allocator); + } + document.Parse(json_string.data(), json_string.size()); + if (UNLIKELY(document.HasParseError())) { + null_map[row] = 1; + result_column.insert_default(); + continue; + } + auto* root_val = match_value(parsed_paths, &document, allocator); + if (root_val != nullptr) { + value.CopyFrom(*root_val, allocator); + } } + insert_result_lambda(value, row); } - if (value.IsNull()) { - null_map[row] = 1; - result_column.insert_default(); - } else { - // write value as string - buf.Clear(); - writer.Reset(buf); - value.Accept(writer); - result_column.insert_data(buf.GetString(), buf.GetSize()); + } else { + for (size_t row = 0; row < input_rows_count; row++) { + value = parse_json(json_col, data_columns[1], allocator, row, 1, + column_is_consts); + insert_result_lambda(value, row); } } + } else { + rapidjson::Value value; + value.SetArray(); + value.Reserve(data_columns.size() - 1, allocator); for (size_t row = 0; row < input_rows_count; row++) { - rapidjson::Value value; - value.SetArray(); - value.Reserve(data_columns.size() - 1, allocator); + value.Clear(); for (size_t col = 1; col < data_columns.size(); ++col) { value.PushBack(parse_json(json_col, data_columns[col], allocator, row, col, column_is_consts), allocator); } - - if (value.IsNull()) { - null_map[row] = 1; - result_column.insert_default(); - } else { - // write value as string - buf.Clear(); - writer.Reset(buf); - value.Accept(writer); - result_column.insert_data(buf.GetString(), buf.GetSize()); - } + insert_result_lambda(value, row); } } } From 94084bf2cc7fe1aa5c22a33c845c5ac73ab79512 Mon Sep 17 00:00:00 2001 From: zhangstar333 <2561612514@qq.com> Date: Fri, 28 Jun 2024 15:48:55 +0800 Subject: [PATCH 4/5] update about clear value --- be/src/vec/functions/function_json.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/be/src/vec/functions/function_json.cpp b/be/src/vec/functions/function_json.cpp index 58a81fd267ff27..f2aecf21fe618e 100644 --- a/be/src/vec/functions/function_json.cpp +++ b/be/src/vec/functions/function_json.cpp @@ -913,6 +913,9 @@ struct FunctionJsonExtractImpl { auto* root_val = match_value(parsed_paths, &document, allocator); if (root_val != nullptr) { value.CopyFrom(*root_val, allocator); + } else { + rapidjson::Value tmp; + value.Swap(tmp); } } insert_result_lambda(value, row); @@ -924,7 +927,7 @@ struct FunctionJsonExtractImpl { insert_result_lambda(value, row); } } - + } else { rapidjson::Value value; value.SetArray(); @@ -993,7 +996,6 @@ class FunctionJsonNullable : public IFunction { size_t result, size_t input_rows_count) const override { auto result_column = ColumnString::create(); auto null_map = ColumnUInt8::create(input_rows_count, 0); - std::vector column_ptrs; // prevent converted column destruct std::vector data_columns; std::vector column_is_consts; for (int i = 0; i < arguments.size(); i++) { @@ -1002,8 +1004,7 @@ class FunctionJsonNullable : public IFunction { std::tie(arg_col, arg_const) = unpack_if_const(block.get_by_position(arguments[i]).column); column_is_consts.push_back(arg_const); - column_ptrs.push_back(arg_col); - data_columns.push_back(assert_cast(column_ptrs.back().get())); + data_columns.push_back(assert_cast(arg_col.get())); } Impl::execute(data_columns, *assert_cast(result_column.get()), null_map->get_data(), input_rows_count, column_is_consts); From 55cb5d57a9144e5d3c1f7035ecc5ea9b8b997700 Mon Sep 17 00:00:00 2001 From: zhangstar333 <2561612514@qq.com> Date: Mon, 1 Jul 2024 16:25:10 +0800 Subject: [PATCH 5/5] update review --- be/src/vec/functions/function_json.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/be/src/vec/functions/function_json.cpp b/be/src/vec/functions/function_json.cpp index f2aecf21fe618e..2faeb24d514c16 100644 --- a/be/src/vec/functions/function_json.cpp +++ b/be/src/vec/functions/function_json.cpp @@ -831,9 +831,9 @@ struct FunctionJsonExtractImpl { rapidjson::Value value; rapidjson::Document document; - const auto& obj = json_col->get_data_at(index_check_const(row, column_is_consts[0])); + const auto obj = json_col->get_data_at(index_check_const(row, column_is_consts[0])); std::string_view json_string(obj.data, obj.size); - const auto& path = path_col->get_data_at(index_check_const(row, column_is_consts[col])); + const auto path = path_col->get_data_at(index_check_const(row, column_is_consts[col])); std::string_view path_string(path.data, path.size); auto* root = get_json_object(json_string, path_string, &document); if (root != nullptr) { @@ -846,7 +846,7 @@ struct FunctionJsonExtractImpl { rapidjson::Document* document, std::vector& parsed_paths, const int row, bool is_const_column) { - const auto& path = path_col->get_data_at(index_check_const(row, is_const_column)); + const auto path = path_col->get_data_at(index_check_const(row, is_const_column)); std::string_view path_string(path.data, path.size); //Cannot use '\' as the last character, return NULL if (path_string.back() == '\\') { @@ -860,6 +860,7 @@ struct FunctionJsonExtractImpl { #else auto tok = get_json_token(path_string); #endif + // TODO: here maybe could use std::vector or std::span std::vector paths(tok.begin(), tok.end()); get_parsed_paths(paths, &parsed_paths); if (parsed_paths.empty()) {