From 191a2d22d706ca81470798a087eebc2c5c3a7910 Mon Sep 17 00:00:00 2001 From: amorynan Date: Tue, 4 Mar 2025 17:11:04 +0800 Subject: [PATCH 1/6] fix invalid jsonb value write into segment file which make select core --- be/src/vec/olap/olap_data_convertor.cpp | 21 +- be/src/vec/olap/olap_data_convertor.h | 4 +- be/test/vec/olap/jsonb_value_test.cpp | 242 ++++++++++++++++++++++++ 3 files changed, 262 insertions(+), 5 deletions(-) create mode 100644 be/test/vec/olap/jsonb_value_test.cpp diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index 64fa885780a5c5..fffd7b24982795 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -179,7 +179,7 @@ OlapBlockDataConvertor::create_olap_column_data_convertor(const TabletColumn& co return std::make_unique>(); } case FieldType::OLAP_FIELD_TYPE_JSONB: { - return std::make_unique(true); + return std::make_unique(true, true); } case FieldType::OLAP_FIELD_TYPE_BOOL: { return std::make_unique>(); @@ -233,7 +233,10 @@ OlapBlockDataConvertor::create_olap_column_data_convertor(const TabletColumn& co void OlapBlockDataConvertor::set_source_content(const vectorized::Block* block, size_t row_pos, size_t num_rows) { DCHECK(block && num_rows > 0 && row_pos + num_rows <= block->rows() && - block->columns() == _convertors.size()); + block->columns() == _convertors.size()) + << "block=" << block->dump_structure() << ", block rows=" << block->rows() + << ", row_pos=" << row_pos << ", num_rows=" << num_rows + << ", convertors.size=" << _convertors.size(); size_t cid = 0; for (const auto& typed_column : *block) { if (typed_column.column->size() != block->rows()) { @@ -629,8 +632,8 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorChar::convert_to_olap() { // class OlapBlockDataConvertor::OlapColumnDataConvertorVarChar OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::OlapColumnDataConvertorVarChar( - bool check_length) - : _check_length(check_length) {} + bool check_length, bool is_jsonb) + : _check_length(check_length), _is_jsonb(is_jsonb) {} void OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::set_source_column( const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { @@ -674,6 +677,11 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap( "Not support string len over than " "`string_type_length_soft_limit_bytes` in vec engine."); } + // Make sure that the json binary data written in is the correct jsonb value. + if (_is_jsonb && !doris::JsonbDocument::createDocument(slice->data, slice->size)) { + return Status::InvalidArgument("invalid json binary value: {}", + std::string_view(slice->data, slice->size)); + } } else { // TODO: this may not be necessary, check and remove later slice->data = nullptr; @@ -695,6 +703,11 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap( "Not support string len over than `string_type_length_soft_limit_bytes`" " in vec engine."); } + // Make sure that the json binary data written in is the correct jsonb value. + if (_is_jsonb && !doris::JsonbDocument::createDocument(slice->data, slice->size)) { + return Status::InvalidArgument("invalid json binary value: {}", + std::string_view(slice->data, slice->size)); + } string_offset = *offset_cur; ++slice; ++offset_cur; diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h index 75aff7dfec34cd..a1b0abee518be9 100644 --- a/be/src/vec/olap/olap_data_convertor.h +++ b/be/src/vec/olap/olap_data_convertor.h @@ -204,7 +204,7 @@ class OlapBlockDataConvertor { class OlapColumnDataConvertorVarChar : public OlapColumnDataConvertorBase { public: - OlapColumnDataConvertorVarChar(bool check_length); + OlapColumnDataConvertorVarChar(bool check_length, bool is_jsonb = false); ~OlapColumnDataConvertorVarChar() override = default; void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, @@ -216,6 +216,8 @@ class OlapBlockDataConvertor { private: bool _check_length; + bool _is_jsonb = + false; // Make sure that the json binary data written in is the correct jsonb value. PaddedPODArray _slice; }; diff --git a/be/test/vec/olap/jsonb_value_test.cpp b/be/test/vec/olap/jsonb_value_test.cpp new file mode 100644 index 00000000000000..3111163c0beede --- /dev/null +++ b/be/test/vec/olap/jsonb_value_test.cpp @@ -0,0 +1,242 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "gtest/gtest_pred_impl.h" +#include "vec/columns/column_string.h" +#include "vec/common/string_ref.h" +#include "vec/core/columns_with_type_and_name.h" +#include "vec/data_types/serde/data_type_serde.h" +#include "vec/olap/olap_data_convertor.h" + +namespace doris::vectorized { + +TEST(JsonbValueConvertorTest, JsonbValueValid) { + // 1. create jsonb column with serde + auto input = ColumnString::create(); + auto dataTypeJsonb = std::make_shared(); + auto serde = dataTypeJsonb->get_serde(); + vectorized::DataTypeSerDe::FormatOptions options; + + // Test case 1 + std::string str1 = "{\"key1\": \"value1\"}"; + Slice slice1 = Slice(str1.data(), str1.length()); + auto st1 = serde->deserialize_one_cell_from_json(*input, slice1, options); + ASSERT_TRUE(st1.ok()); + ASSERT_EQ(input->size(), 1); + + // Test case 2 + std::string str2 = "{\"key2\": 12345}"; + Slice slice2 = Slice(str2.data(), str2.length()); + auto st2 = serde->deserialize_one_cell_from_json(*input, slice2, options); + ASSERT_TRUE(st2.ok()); + ASSERT_EQ(input->size(), 2); + + // Test case 3 + std::string str3 = "{\"key3\": true}"; + Slice slice3 = Slice(str3.data(), str3.length()); + auto st3 = serde->deserialize_one_cell_from_json(*input, slice3, options); + ASSERT_TRUE(st3.ok()); + ASSERT_EQ(input->size(), 3); + + // Test case 4 + std::string str4 = "{\"key4\": [1, 2, 3]}"; + Slice slice4 = Slice(str4.data(), str4.length()); + auto st4 = serde->deserialize_one_cell_from_json(*input, slice4, options); + ASSERT_TRUE(st4.ok()); + ASSERT_EQ(input->size(), 4); + + // Test case 5 + std::string str5 = "{\"key5\": {\"subkey\": \"subvalue\"}}"; + Slice slice5 = Slice(str5.data(), str5.length()); + auto st5 = serde->deserialize_one_cell_from_json(*input, slice5, options); + ASSERT_TRUE(st5.ok()); + ASSERT_EQ(input->size(), 5); + + // 2. put column into block + vectorized::ColumnWithTypeAndName argument(input->assume_mutable(), dataTypeJsonb, + "jsonb_column"); + Block block; + block.insert(argument); + + // 3. use OlapColumnDataConvertorVarchar::convert_to_olap to convert column data to segment file data + auto _olap_data_convertor = std::make_unique(); + TabletColumn jsonb_column = TabletColumn(); + jsonb_column.set_type(FieldType::OLAP_FIELD_TYPE_JSONB); + _olap_data_convertor->add_column_data_convertor(jsonb_column); + _olap_data_convertor->set_source_content(&block, 0, 5); + auto [status, column] = _olap_data_convertor->convert_column_data(0); + ASSERT_TRUE(status.ok()); + ASSERT_NE(column, nullptr); + + // test with null map + auto nullable_col = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create()); + auto nullable_dataTypeJsonb = make_nullable(std::make_shared()); + auto serde1 = nullable_dataTypeJsonb->get_serde(); + + auto st = serde1->deserialize_one_cell_from_json(*nullable_col, slice1, options); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(1, nullable_col->size()); + + // insert null + nullable_col->insert_default(); + ASSERT_EQ(2, nullable_col->size()); + + st = serde1->deserialize_one_cell_from_json(*nullable_col, slice2, options); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(3, nullable_col->size()); + + // deserialize null + Slice slice_null = "NULL"; + st = serde1->deserialize_one_cell_from_json(*nullable_col, slice_null, options); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(4, nullable_col->size()); + + st = serde1->deserialize_one_cell_from_json(*nullable_col, slice3, options); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(5, nullable_col->size()); + + // 2. put column into block + vectorized::ColumnWithTypeAndName argument1(nullable_col->assume_mutable(), + nullable_dataTypeJsonb, "jsonb_column_null"); + block.clear(); + block.insert(argument1); + + // 3. use OlapColumnDataConvertorVarchar::convert_to_olap to convert column data to segment file data + _olap_data_convertor->reset(); + _olap_data_convertor->add_column_data_convertor(jsonb_column); + _olap_data_convertor->set_source_content(&block, 0, 5); + auto [status1, column1] = _olap_data_convertor->convert_column_data(0); + ASSERT_TRUE(status1.ok()) << status1.to_string(); + ASSERT_NE(column1, nullptr); +} + +TEST(JsonbValueConvertorTest, JsonbValueInvalid) { + // 1. create jsonb column with serde + auto input = ColumnString::create(); + auto dataTypeJsonb = std::make_shared(); + auto serde = dataTypeJsonb->get_serde(); + vectorized::DataTypeSerDe::FormatOptions options; + + // Test case 1 + std::string str1 = "{\"key1\": \"value1\"}"; + Slice slice1 = Slice(str1.data(), str1.length()); + auto st1 = serde->deserialize_one_cell_from_json(*input, slice1, options); + ASSERT_TRUE(st1.ok()); + ASSERT_EQ(input->size(), 1); + + // Test case 2 + std::string str2 = "{\"key2\": 12345}"; + Slice slice2 = Slice(str2.data(), str2.length()); + auto st2 = serde->deserialize_one_cell_from_json(*input, slice2, options); + ASSERT_TRUE(st2.ok()); + ASSERT_EQ(input->size(), 2); + + // Test case 3 + std::string str3 = "{\"key3\": true}"; + Slice slice3 = Slice(str3.data(), str3.length()); + auto st3 = serde->deserialize_one_cell_from_json(*input, slice3, options); + ASSERT_TRUE(st3.ok()); + ASSERT_EQ(input->size(), 3); + + // Test case 4 + std::string str4 = "{\"key4\": [1, 2, 3]}"; + Slice slice4 = Slice(str4.data(), str4.length()); + auto st4 = serde->deserialize_one_cell_from_json(*input, slice4, options); + ASSERT_TRUE(st4.ok()); + ASSERT_EQ(input->size(), 4); + // invalid jsonb data + auto& data = input->get_chars(); + data.emplace_back('s'); + + // Test case 5 + std::string str5 = "{\"key5\": {\"subkey\": \"subvalue\"}}"; + Slice slice5 = Slice(str5.data(), str5.length()); + auto st5 = serde->deserialize_one_cell_from_json(*input, slice5, options); + ASSERT_TRUE(st5.ok()); + ASSERT_EQ(input->size(), 5); + + // 2. put column into block + vectorized::ColumnWithTypeAndName argument(input->assume_mutable(), dataTypeJsonb, + "jsonb_column"); + Block block; + block.insert(argument); + + // 3. use OlapColumnDataConvertorVarchar::convert_to_olap to convert column data to segment file data + auto _olap_data_convertor = std::make_unique(); + TabletColumn jsonb_column = TabletColumn(); + jsonb_column.set_type(FieldType::OLAP_FIELD_TYPE_JSONB); + _olap_data_convertor->add_column_data_convertor(jsonb_column); + _olap_data_convertor->set_source_content(&block, 0, 5); + auto [status, column] = _olap_data_convertor->convert_column_data(0); + // invalid will make error + ASSERT_FALSE(status.ok()); + ASSERT_TRUE(status.to_string().find("invalid json binary value") != std::string::npos); + ASSERT_NE(column, nullptr); + + // test with null map + auto nullable_col = ColumnNullable::create(ColumnString::create(), ColumnUInt8::create()); + auto nullable_dataTypeJsonb = make_nullable(std::make_shared()); + auto serde1 = nullable_dataTypeJsonb->get_serde(); + + auto st = serde1->deserialize_one_cell_from_json(*nullable_col, slice1, options); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(1, nullable_col->size()); + + // insert null + nullable_col->insert_default(); + ASSERT_EQ(2, nullable_col->size()); + + st = serde1->deserialize_one_cell_from_json(*nullable_col, slice2, options); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(3, nullable_col->size()); + // invalid jsonb data + auto string_data = assert_cast(nullable_col->get_nested_column_ptr().get()); + auto& dat = string_data->get_chars(); + dat.emplace_back('s'); + + // deserialize null + Slice slice_null = "NULL"; + st = serde1->deserialize_one_cell_from_json(*nullable_col, slice_null, options); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(4, nullable_col->size()); + + st = serde1->deserialize_one_cell_from_json(*nullable_col, slice3, options); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(5, nullable_col->size()); + + // 2. put column into block + vectorized::ColumnWithTypeAndName argument1(nullable_col->assume_mutable(), + nullable_dataTypeJsonb, "jsonb_column_null"); + block.clear(); + block.insert(argument1); + + // 3. use OlapColumnDataConvertorVarchar::convert_to_olap to convert column data to segment file data + _olap_data_convertor->reset(); + _olap_data_convertor->add_column_data_convertor(jsonb_column); + _olap_data_convertor->set_source_content(&block, 0, 5); + auto [status1, column1] = _olap_data_convertor->convert_column_data(0); + ASSERT_FALSE(status.ok()); + ASSERT_TRUE(status.to_string().find("invalid json binary value") != std::string::npos); + ASSERT_NE(column, nullptr); +} + +} // namespace doris::vectorized From b0a044ba45c0e04b10abc8fa30cf669b9c8a6e33 Mon Sep 17 00:00:00 2001 From: amorynan Date: Tue, 4 Mar 2025 21:02:59 +0800 Subject: [PATCH 2/6] make function name more readable --- be/src/util/jsonb_document.h | 4 ++-- be/src/util/jsonb_utils.h | 2 +- be/src/util/jsonb_writer.h | 2 +- .../exprs/table_function/vexplode_json_array.cpp | 2 +- .../table_function/vexplode_json_object.cpp | 2 +- be/src/vec/functions/function_cast.h | 4 ++-- be/src/vec/functions/function_jsonb.cpp | 16 ++++++++-------- be/src/vec/jsonb/serialize.cpp | 2 +- be/src/vec/olap/olap_data_convertor.cpp | 11 +++++------ .../data_types/common_data_type_serder_test.h | 2 +- .../data_types/serde/data_type_serde_test.cpp | 4 ++-- 11 files changed, 25 insertions(+), 26 deletions(-) diff --git a/be/src/util/jsonb_document.h b/be/src/util/jsonb_document.h index 909ee70742998e..dbb6fad59b510b 100644 --- a/be/src/util/jsonb_document.h +++ b/be/src/util/jsonb_document.h @@ -177,7 +177,7 @@ class JsonbDocument { static JsonbDocument* makeDocument(char* pb, uint32_t size, const JsonbValue* rval); // create an JsonbDocument object from JSONB packed bytes - static JsonbDocument* createDocument(const char* pb, size_t size); + static JsonbDocument* checkAndCreateDocument(const char* pb, size_t size); // create an JsonbValue from JSONB packed bytes static JsonbValue* createValue(const char* pb, size_t size); @@ -1138,7 +1138,7 @@ inline JsonbDocument* JsonbDocument::makeDocument(char* pb, uint32_t size, const return doc; } -inline JsonbDocument* JsonbDocument::createDocument(const char* pb, size_t size) { +inline JsonbDocument* JsonbDocument::checkAndCreateDocument(const char* pb, size_t size) { if (!pb || size < sizeof(JsonbHeader) + sizeof(JsonbValue)) { return nullptr; } diff --git a/be/src/util/jsonb_utils.h b/be/src/util/jsonb_utils.h index 8ec842ef227dd5..6f9678946c2568 100644 --- a/be/src/util/jsonb_utils.h +++ b/be/src/util/jsonb_utils.h @@ -41,7 +41,7 @@ class JsonbToJson { // get json string const std::string to_json_string(const char* data, size_t size) { - JsonbDocument* pdoc = doris::JsonbDocument::createDocument(data, size); + JsonbDocument* pdoc = doris::JsonbDocument::checkAndCreateDocument(data, size); if (!pdoc) { throw Exception(Status::FatalError("invalid json binary value: {}", std::string_view(data, size))); diff --git a/be/src/util/jsonb_writer.h b/be/src/util/jsonb_writer.h index 52d912d29d3b6d..5d0b2c23e0493c 100644 --- a/be/src/util/jsonb_writer.h +++ b/be/src/util/jsonb_writer.h @@ -479,7 +479,7 @@ class JsonbWriterT { OS_TYPE* getOutput() { return os_; } JsonbDocument* getDocument() { - return JsonbDocument::createDocument(getOutput()->getBuffer(), getOutput()->getSize()); + return JsonbDocument::checkAndCreateDocument(getOutput()->getBuffer(), getOutput()->getSize()); } JsonbValue* getValue() { diff --git a/be/src/vec/exprs/table_function/vexplode_json_array.cpp b/be/src/vec/exprs/table_function/vexplode_json_array.cpp index 673ae0c9c266d3..9b3d7879314afc 100644 --- a/be/src/vec/exprs/table_function/vexplode_json_array.cpp +++ b/be/src/vec/exprs/table_function/vexplode_json_array.cpp @@ -61,7 +61,7 @@ void VExplodeJsonArrayTableFunction::process_row(size_t row_idx) { StringRef text = _text_column->get_data_at(row_idx); if (text.data != nullptr) { if (WhichDataType(_text_datatype).is_json()) { - JsonbDocument* doc = JsonbDocument::createDocument(text.data, text.size); + JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(text.data, text.size); if (doc && doc->getValue() && doc->getValue()->isArray()) { auto* a = (ArrayVal*)doc->getValue(); if (a->numElem() > 0) { diff --git a/be/src/vec/exprs/table_function/vexplode_json_object.cpp b/be/src/vec/exprs/table_function/vexplode_json_object.cpp index 7db4da395aeae2..12af72a690316a 100644 --- a/be/src/vec/exprs/table_function/vexplode_json_object.cpp +++ b/be/src/vec/exprs/table_function/vexplode_json_object.cpp @@ -55,7 +55,7 @@ void VExplodeJsonObjectTableFunction::process_row(size_t row_idx) { StringRef text = _json_object_column->get_data_at(row_idx); if (text.data != nullptr) { - JsonbDocument* doc = JsonbDocument::createDocument(text.data, text.size); + JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(text.data, text.size); if (!doc || !doc->getValue()) [[unlikely]] { // error jsonb, put null into output, cur_size = 0 , we will insert_default return; diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h index 483e837de5dfd8..ab852b609becd7 100644 --- a/be/src/vec/functions/function_cast.h +++ b/be/src/vec/functions/function_cast.h @@ -726,7 +726,7 @@ struct ConvertImplGenericFromJsonb { const bool is_dst_string = is_string_or_fixed_string(data_type_to); for (size_t i = 0; i < size; ++i) { const auto& val = col_from_string->get_data_at(i); - JsonbDocument* doc = JsonbDocument::createDocument(val.data, val.size); + JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(val.data, val.size); if (UNLIKELY(!doc || !doc->getValue())) { (*vec_null_map_to)[i] = 1; col_to->insert_default(); @@ -889,7 +889,7 @@ struct ConvertImplFromJsonb { } // doc is NOT necessary to be deleted since JsonbDocument will not allocate memory - JsonbDocument* doc = JsonbDocument::createDocument(val.data, val.size); + JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(val.data, val.size); if (UNLIKELY(!doc || !doc->getValue())) { null_map[i] = 1; res[i] = 0; diff --git a/be/src/vec/functions/function_jsonb.cpp b/be/src/vec/functions/function_jsonb.cpp index dcae26f3c2f844..29eb1e7e6c4799 100644 --- a/be/src/vec/functions/function_jsonb.cpp +++ b/be/src/vec/functions/function_jsonb.cpp @@ -561,7 +561,7 @@ class FunctionJsonbKeys : public IFunction { continue; } const char* l_raw = reinterpret_cast(&ldata[l_off]); - JsonbDocument* doc = JsonbDocument::createDocument(l_raw, l_size); + JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(l_raw, l_size); if (UNLIKELY(!doc || !doc->getValue())) { dst_arr.clear(); return Status::InvalidArgument("jsonb data is invalid"); @@ -669,7 +669,7 @@ class FunctionJsonbExtractPath : public IFunction { static ALWAYS_INLINE void inner_loop_impl(size_t i, Container& res, const char* l_raw_str, size_t l_str_size, JsonbPath& path) { // doc is NOT necessary to be deleted since JsonbDocument will not allocate memory - JsonbDocument* doc = JsonbDocument::createDocument(l_raw_str, l_str_size); + JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(l_raw_str, l_str_size); if (UNLIKELY(!doc || !doc->getValue())) { return; } @@ -764,7 +764,7 @@ struct JsonbExtractStringImpl { } // doc is NOT necessary to be deleted since JsonbDocument will not allocate memory - JsonbDocument* doc = JsonbDocument::createDocument(l_raw, l_size); + JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(l_raw, l_size); if (UNLIKELY(!doc || !doc->getValue())) { StringOP::push_null_string(i, res_data, res_offsets, null_map); return; @@ -890,7 +890,7 @@ struct JsonbExtractStringImpl { writer->writeStartArray(); // doc is NOT necessary to be deleted since JsonbDocument will not allocate memory - JsonbDocument* doc = JsonbDocument::createDocument(l_raw, l_size); + JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(l_raw, l_size); for (size_t pi = 0; pi < rdata_columns.size(); ++pi) { if (UNLIKELY(!doc || !doc->getValue())) { @@ -1031,7 +1031,7 @@ struct JsonbExtractImpl { } // doc is NOT necessary to be deleted since JsonbDocument will not allocate memory - JsonbDocument* doc = JsonbDocument::createDocument(l_raw_str, l_str_size); + JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(l_raw_str, l_str_size); if (UNLIKELY(!doc || !doc->getValue())) { null_map[i] = 1; res[i] = 0; @@ -1409,7 +1409,7 @@ struct JsonbLengthUtil { } auto jsonb_value = jsonb_data_column->get_data_at(i); // doc is NOT necessary to be deleted since JsonbDocument will not allocate memory - JsonbDocument* doc = JsonbDocument::createDocument(jsonb_value.data, jsonb_value.size); + JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(jsonb_value.data, jsonb_value.size); JsonbValue* value = doc->getValue()->findValue(path, nullptr); if (UNLIKELY(!value)) { null_map->get_data()[i] = 1; @@ -1544,9 +1544,9 @@ struct JsonbContainsUtil { } // doc is NOT necessary to be deleted since JsonbDocument will not allocate memory JsonbDocument* doc1 = - JsonbDocument::createDocument(jsonb_value1.data, jsonb_value1.size); + JsonbDocument::checkAndCreateDocument(jsonb_value1.data, jsonb_value1.size); JsonbDocument* doc2 = - JsonbDocument::createDocument(jsonb_value2.data, jsonb_value2.size); + JsonbDocument::checkAndCreateDocument(jsonb_value2.data, jsonb_value2.size); JsonbValue* value1 = doc1->getValue()->findValue(path, nullptr); JsonbValue* value2 = doc2->getValue(); diff --git a/be/src/vec/jsonb/serialize.cpp b/be/src/vec/jsonb/serialize.cpp index a35d722e01565c..d75d332f40c2d6 100644 --- a/be/src/vec/jsonb/serialize.cpp +++ b/be/src/vec/jsonb/serialize.cpp @@ -91,7 +91,7 @@ void JsonbSerializeUtil::jsonb_to_block(const DataTypeSerDeSPtrs& serdes, const const std::unordered_map& col_id_to_idx, Block& dst, const std::vector& default_values, const std::unordered_set& include_cids) { - auto pdoc = JsonbDocument::createDocument(data, size); + auto pdoc = JsonbDocument::checkAndCreateDocument(data, size); JsonbDocument& doc = *pdoc; size_t num_rows = dst.rows(); size_t filled_columns = 0; diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index fffd7b24982795..c34ca7daa6d304 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -233,10 +233,9 @@ OlapBlockDataConvertor::create_olap_column_data_convertor(const TabletColumn& co void OlapBlockDataConvertor::set_source_content(const vectorized::Block* block, size_t row_pos, size_t num_rows) { DCHECK(block && num_rows > 0 && row_pos + num_rows <= block->rows() && - block->columns() == _convertors.size()) - << "block=" << block->dump_structure() << ", block rows=" << block->rows() - << ", row_pos=" << row_pos << ", num_rows=" << num_rows - << ", convertors.size=" << _convertors.size(); + block->columns() == _convertors.size()) << "block=" << block->dump_structure() << ", block rows=" << block->rows() + << ", row_pos=" << row_pos << ", num_rows=" << num_rows + << ", convertors.size=" << _convertors.size(); size_t cid = 0; for (const auto& typed_column : *block) { if (typed_column.column->size() != block->rows()) { @@ -678,7 +677,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap( "`string_type_length_soft_limit_bytes` in vec engine."); } // Make sure that the json binary data written in is the correct jsonb value. - if (_is_jsonb && !doris::JsonbDocument::createDocument(slice->data, slice->size)) { + if (_is_jsonb && !doris::JsonbDocument::checkAndCreateDocument(slice->data, slice->size)) { return Status::InvalidArgument("invalid json binary value: {}", std::string_view(slice->data, slice->size)); } @@ -704,7 +703,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap( " in vec engine."); } // Make sure that the json binary data written in is the correct jsonb value. - if (_is_jsonb && !doris::JsonbDocument::createDocument(slice->data, slice->size)) { + if (_is_jsonb && !doris::JsonbDocument::checkAndCreateDocument(slice->data, slice->size)) { return Status::InvalidArgument("invalid json binary value: {}", std::string_view(slice->data, slice->size)); } diff --git a/be/test/vec/data_types/common_data_type_serder_test.h b/be/test/vec/data_types/common_data_type_serder_test.h index 3c800bd8b99b71..516855fbc2dcb4 100644 --- a/be/test/vec/data_types/common_data_type_serder_test.h +++ b/be/test/vec/data_types/common_data_type_serder_test.h @@ -291,7 +291,7 @@ class CommonDataTypeSerdeTest : public ::testing::Test { EXPECT_EQ(jsonb_column->size(), load_cols[0]->size()); for (size_t r = 0; r < jsonb_column->size(); ++r) { StringRef jsonb_data = jsonb_column->get_data_at(r); - auto pdoc = JsonbDocument::createDocument(jsonb_data.data, jsonb_data.size); + auto pdoc = JsonbDocument::checkAndCreateDocument(jsonb_data.data, jsonb_data.size); JsonbDocument& doc = *pdoc; size_t cIdx = 0; for (auto it = doc->begin(); it != doc->end(); ++it) { diff --git a/be/test/vec/data_types/serde/data_type_serde_test.cpp b/be/test/vec/data_types/serde/data_type_serde_test.cpp index b653edf3e4ea7a..b2369c13eb7659 100644 --- a/be/test/vec/data_types/serde/data_type_serde_test.cpp +++ b/be/test/vec/data_types/serde/data_type_serde_test.cpp @@ -240,7 +240,7 @@ TEST(DataTypeSerDeTest, DataTypeRowStoreSerDeTest) { jsonb_column->insert_data(jsonb_writer.getOutput()->getBuffer(), jsonb_writer.getOutput()->getSize()); StringRef jsonb_data = jsonb_column->get_data_at(0); - auto pdoc = JsonbDocument::createDocument(jsonb_data.data, jsonb_data.size); + auto pdoc = JsonbDocument::checkAndCreateDocument(jsonb_data.data, jsonb_data.size); JsonbDocument& doc = *pdoc; for (auto it = doc->begin(); it != doc->end(); ++it) { serde->read_one_cell_from_jsonb(*vec, it->value()); @@ -270,7 +270,7 @@ TEST(DataTypeSerDeTest, DataTypeRowStoreSerDeTest) { jsonb_column->insert_data(jsonb_writer.getOutput()->getBuffer(), jsonb_writer.getOutput()->getSize()); StringRef jsonb_data = jsonb_column->get_data_at(0); - auto pdoc = JsonbDocument::createDocument(jsonb_data.data, jsonb_data.size); + auto pdoc = JsonbDocument::checkAndCreateDocument(jsonb_data.data, jsonb_data.size); JsonbDocument& doc = *pdoc; for (auto it = doc->begin(); it != doc->end(); ++it) { serde->read_one_cell_from_jsonb(*vec, it->value()); From d469aabbef9f35cd37e88025392740ad677f1259 Mon Sep 17 00:00:00 2001 From: amorynan Date: Tue, 4 Mar 2025 21:35:14 +0800 Subject: [PATCH 3/6] format code --- be/src/util/jsonb_writer.h | 3 ++- be/src/vec/functions/function_jsonb.cpp | 3 ++- be/src/vec/olap/olap_data_convertor.cpp | 13 ++++++++----- .../inverted_index/analyzer/icu_anzlyzer_test.cpp | 3 +-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/be/src/util/jsonb_writer.h b/be/src/util/jsonb_writer.h index 5d0b2c23e0493c..9da386aa32e265 100644 --- a/be/src/util/jsonb_writer.h +++ b/be/src/util/jsonb_writer.h @@ -479,7 +479,8 @@ class JsonbWriterT { OS_TYPE* getOutput() { return os_; } JsonbDocument* getDocument() { - return JsonbDocument::checkAndCreateDocument(getOutput()->getBuffer(), getOutput()->getSize()); + return JsonbDocument::checkAndCreateDocument(getOutput()->getBuffer(), + getOutput()->getSize()); } JsonbValue* getValue() { diff --git a/be/src/vec/functions/function_jsonb.cpp b/be/src/vec/functions/function_jsonb.cpp index 29eb1e7e6c4799..a822684a974b7c 100644 --- a/be/src/vec/functions/function_jsonb.cpp +++ b/be/src/vec/functions/function_jsonb.cpp @@ -1409,7 +1409,8 @@ struct JsonbLengthUtil { } auto jsonb_value = jsonb_data_column->get_data_at(i); // doc is NOT necessary to be deleted since JsonbDocument will not allocate memory - JsonbDocument* doc = JsonbDocument::checkAndCreateDocument(jsonb_value.data, jsonb_value.size); + JsonbDocument* doc = + JsonbDocument::checkAndCreateDocument(jsonb_value.data, jsonb_value.size); JsonbValue* value = doc->getValue()->findValue(path, nullptr); if (UNLIKELY(!value)) { null_map->get_data()[i] = 1; diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index c34ca7daa6d304..72dd8a89e75b25 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -233,9 +233,10 @@ OlapBlockDataConvertor::create_olap_column_data_convertor(const TabletColumn& co void OlapBlockDataConvertor::set_source_content(const vectorized::Block* block, size_t row_pos, size_t num_rows) { DCHECK(block && num_rows > 0 && row_pos + num_rows <= block->rows() && - block->columns() == _convertors.size()) << "block=" << block->dump_structure() << ", block rows=" << block->rows() - << ", row_pos=" << row_pos << ", num_rows=" << num_rows - << ", convertors.size=" << _convertors.size(); + block->columns() == _convertors.size()) + << "block=" << block->dump_structure() << ", block rows=" << block->rows() + << ", row_pos=" << row_pos << ", num_rows=" << num_rows + << ", convertors.size=" << _convertors.size(); size_t cid = 0; for (const auto& typed_column : *block) { if (typed_column.column->size() != block->rows()) { @@ -677,7 +678,8 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap( "`string_type_length_soft_limit_bytes` in vec engine."); } // Make sure that the json binary data written in is the correct jsonb value. - if (_is_jsonb && !doris::JsonbDocument::checkAndCreateDocument(slice->data, slice->size)) { + if (_is_jsonb && + !doris::JsonbDocument::checkAndCreateDocument(slice->data, slice->size)) { return Status::InvalidArgument("invalid json binary value: {}", std::string_view(slice->data, slice->size)); } @@ -703,7 +705,8 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap( " in vec engine."); } // Make sure that the json binary data written in is the correct jsonb value. - if (_is_jsonb && !doris::JsonbDocument::checkAndCreateDocument(slice->data, slice->size)) { + if (_is_jsonb && + !doris::JsonbDocument::checkAndCreateDocument(slice->data, slice->size)) { return Status::InvalidArgument("invalid json binary value: {}", std::string_view(slice->data, slice->size)); } diff --git a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp index 98fa722be2caaf..05cda27df5c971 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp @@ -545,8 +545,7 @@ TEST_F(ICUTokenizerTest, TestICUEmoji) { "🏴"; tokenize(longWordText, datas); std::vector result = { - "πŸ’©", "πŸ’©", "πŸ’©", "πŸ‘©β€β€οΈβ€πŸ‘©", "πŸ‘¨πŸΌβ€βš•οΈ", "πŸ‡ΊπŸ‡Έ", "πŸ‡ΊπŸ‡Έ", - "#️⃣", "3️⃣", "🏴"}; + "πŸ’©", "πŸ’©", "πŸ’©", "πŸ‘©β€β€οΈβ€πŸ‘©", "πŸ‘¨πŸΌβ€βš•οΈ", "πŸ‡ΊπŸ‡Έ", "πŸ‡ΊπŸ‡Έ", "#️⃣", "3️⃣", "🏴"}; for (size_t i = 0; i < datas.size(); i++) { ASSERT_EQ(datas[i], result[i]); } From 52223fbf7e278bcb141aee89bf4cc4d6a723e57e Mon Sep 17 00:00:00 2001 From: amorynan Date: Tue, 4 Mar 2025 21:50:42 +0800 Subject: [PATCH 4/6] format --- .../segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp index 05cda27df5c971..98fa722be2caaf 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp @@ -545,7 +545,8 @@ TEST_F(ICUTokenizerTest, TestICUEmoji) { "🏴"; tokenize(longWordText, datas); std::vector result = { - "πŸ’©", "πŸ’©", "πŸ’©", "πŸ‘©β€β€οΈβ€πŸ‘©", "πŸ‘¨πŸΌβ€βš•οΈ", "πŸ‡ΊπŸ‡Έ", "πŸ‡ΊπŸ‡Έ", "#️⃣", "3️⃣", "🏴"}; + "πŸ’©", "πŸ’©", "πŸ’©", "πŸ‘©β€β€οΈβ€πŸ‘©", "πŸ‘¨πŸΌβ€βš•οΈ", "πŸ‡ΊπŸ‡Έ", "πŸ‡ΊπŸ‡Έ", + "#️⃣", "3️⃣", "🏴"}; for (size_t i = 0; i < datas.size(); i++) { ASSERT_EQ(datas[i], result[i]); } From b2c1744e7a465926bed0a4f912d1910008b6d566 Mon Sep 17 00:00:00 2001 From: amory Date: Wed, 5 Mar 2025 11:15:02 +0800 Subject: [PATCH 5/6] Update data_type_serde_agg_state_test.cpp --- .../vec/data_types/serde/data_type_serde_agg_state_test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/be/test/vec/data_types/serde/data_type_serde_agg_state_test.cpp b/be/test/vec/data_types/serde/data_type_serde_agg_state_test.cpp index 12d2be8ab02811..981d94f42d6367 100644 --- a/be/test/vec/data_types/serde/data_type_serde_agg_state_test.cpp +++ b/be/test/vec/data_types/serde/data_type_serde_agg_state_test.cpp @@ -111,7 +111,7 @@ TEST_F(AggStateSerdeTest, writeOneCellToJsonb) { jsonb_column->insert_data(jsonb_writer.getOutput()->getBuffer(), jsonb_writer.getOutput()->getSize()); StringRef jsonb_data = jsonb_column->get_data_at(0); - auto* pdoc = JsonbDocument::createDocument(jsonb_data.data, jsonb_data.size); + auto* pdoc = JsonbDocument::checkAndCreateDocument(jsonb_data.data, jsonb_data.size); JsonbDocument& doc = *pdoc; for (auto it = doc->begin(); it != doc->end(); ++it) { datatype_agg_state_serde_count->read_one_cell_from_jsonb(*column_fixed_length, it->value()); @@ -137,7 +137,7 @@ TEST_F(AggStateSerdeTest, writeOneCellToJsonb2) { jsonb_column->insert_data(jsonb_writer.getOutput()->getBuffer(), jsonb_writer.getOutput()->getSize()); StringRef jsonb_data = jsonb_column->get_data_at(0); - auto* pdoc = JsonbDocument::createDocument(jsonb_data.data, jsonb_data.size); + auto* pdoc = JsonbDocument::checkAndCreateDocument(jsonb_data.data, jsonb_data.size); JsonbDocument& doc = *pdoc; for (auto it = doc->begin(); it != doc->end(); ++it) { datatype_agg_state_serde_hll_union->read_one_cell_from_jsonb(*column_string, it->value()); @@ -436,4 +436,4 @@ TEST_F(AggStateSerdeTest, serializeOneCellToHiveText2) { } std::cout << "test serialize/deserialize_one_cell_from_hive_text2 success" << std::endl; } -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized From 1f8dcbe68a9490327156f6977ffbde94d13f35b1 Mon Sep 17 00:00:00 2001 From: amorynan Date: Wed, 5 Mar 2025 15:41:42 +0800 Subject: [PATCH 6/6] rename for test --- be/test/vec/data_types/serde/data_type_serde_bitmap_test.cpp | 2 +- .../serde/data_type_serde_fixed_length_object_test.cpp | 2 +- be/test/vec/data_types/serde/data_type_serde_hll_test.cpp | 2 +- .../data_types/serde/data_type_serde_quantile_state_test.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/be/test/vec/data_types/serde/data_type_serde_bitmap_test.cpp b/be/test/vec/data_types/serde/data_type_serde_bitmap_test.cpp index 4e428cb53cca7e..5ea6067d2a5256 100644 --- a/be/test/vec/data_types/serde/data_type_serde_bitmap_test.cpp +++ b/be/test/vec/data_types/serde/data_type_serde_bitmap_test.cpp @@ -64,7 +64,7 @@ TEST(BitmapSerdeTest, writeOneCellToJsonb) { jsonb_column->insert_data(jsonb_writer.getOutput()->getBuffer(), jsonb_writer.getOutput()->getSize()); StringRef jsonb_data = jsonb_column->get_data_at(0); - auto* pdoc = JsonbDocument::createDocument(jsonb_data.data, jsonb_data.size); + auto* pdoc = JsonbDocument::checkAndCreateDocument(jsonb_data.data, jsonb_data.size); JsonbDocument& doc = *pdoc; for (auto it = doc->begin(); it != doc->end(); ++it) { bitmap_serde->read_one_cell_from_jsonb(*column_bitmap, it->value()); diff --git a/be/test/vec/data_types/serde/data_type_serde_fixed_length_object_test.cpp b/be/test/vec/data_types/serde/data_type_serde_fixed_length_object_test.cpp index 74d3c1d19e492e..4836b8a12c3643 100644 --- a/be/test/vec/data_types/serde/data_type_serde_fixed_length_object_test.cpp +++ b/be/test/vec/data_types/serde/data_type_serde_fixed_length_object_test.cpp @@ -70,7 +70,7 @@ TEST(FixedLengthObjectSerdeTest, writeOneCellToJsonb) { jsonb_column->insert_data(jsonb_writer.getOutput()->getBuffer(), jsonb_writer.getOutput()->getSize()); StringRef jsonb_data = jsonb_column->get_data_at(0); - auto* pdoc = JsonbDocument::createDocument(jsonb_data.data, jsonb_data.size); + auto* pdoc = JsonbDocument::checkAndCreateDocument(jsonb_data.data, jsonb_data.size); JsonbDocument& doc = *pdoc; for (auto it = doc->begin(); it != doc->end(); ++it) { fixed_length_serde->read_one_cell_from_jsonb(*column_fixed_length, it->value()); diff --git a/be/test/vec/data_types/serde/data_type_serde_hll_test.cpp b/be/test/vec/data_types/serde/data_type_serde_hll_test.cpp index d496d22abc2797..0d24c1ad1ff716 100644 --- a/be/test/vec/data_types/serde/data_type_serde_hll_test.cpp +++ b/be/test/vec/data_types/serde/data_type_serde_hll_test.cpp @@ -69,7 +69,7 @@ TEST(HLLSerdeTest, writeOneCellToJsonb) { jsonb_column->insert_data(jsonb_writer.getOutput()->getBuffer(), jsonb_writer.getOutput()->getSize()); StringRef jsonb_data = jsonb_column->get_data_at(0); - auto* pdoc = JsonbDocument::createDocument(jsonb_data.data, jsonb_data.size); + auto* pdoc = JsonbDocument::checkAndCreateDocument(jsonb_data.data, jsonb_data.size); JsonbDocument& doc = *pdoc; for (auto it = doc->begin(); it != doc->end(); ++it) { hll_serde->read_one_cell_from_jsonb(*column_hll, it->value()); diff --git a/be/test/vec/data_types/serde/data_type_serde_quantile_state_test.cpp b/be/test/vec/data_types/serde/data_type_serde_quantile_state_test.cpp index d280dca0cdec91..7e7909da792326 100644 --- a/be/test/vec/data_types/serde/data_type_serde_quantile_state_test.cpp +++ b/be/test/vec/data_types/serde/data_type_serde_quantile_state_test.cpp @@ -71,7 +71,7 @@ TEST(QuantileStateSerdeTest, writeOneCellToJsonb) { jsonb_column->insert_data(jsonb_writer.getOutput()->getBuffer(), jsonb_writer.getOutput()->getSize()); StringRef jsonb_data = jsonb_column->get_data_at(0); - auto* pdoc = JsonbDocument::createDocument(jsonb_data.data, jsonb_data.size); + auto* pdoc = JsonbDocument::checkAndCreateDocument(jsonb_data.data, jsonb_data.size); JsonbDocument& doc = *pdoc; for (auto it = doc->begin(); it != doc->end(); ++it) { quantile_state_serde->read_one_cell_from_jsonb(*column_quantile_state, it->value());