From d51c6da75be8ea508dbd919103787d1a17db892f Mon Sep 17 00:00:00 2001 From: BiteTheDDDDt Date: Mon, 23 May 2022 12:40:00 +0800 Subject: [PATCH 1/6] add char type padding --- be/src/vec/columns/column_string.h | 11 +++++++++-- be/src/vec/olap/olap_data_convertor.cpp | 15 +++++++++++++++ be/src/vec/olap/olap_data_convertor.h | 2 ++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 47cb68c4fab712..40231527a07f46 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -75,6 +75,8 @@ class ColumnString final : public COWHelper { size_t size() const override { return offsets.size(); } + size_t chars_size() const { return chars.size(); } + size_t byte_size() const override { return chars.size() + offsets.size() * sizeof(offsets[0]); } size_t allocated_bytes() const override { @@ -148,12 +150,17 @@ class ColumnString final : public COWHelper { } void insert_data(const char* pos, size_t length) override { + insert_data_padded(pos, length, length); + } + + // if lengthsize(); + if (column_string->chars_size() != _length * rows) { + _column = vectorized::ColumnString::create(); + auto padded_column = + assert_cast(_column->assume_mutable().get()); + padded_column->resize(rows); + + for (size_t i = 0; i < rows; i++) { + auto ref = column_string->get_data_at(i); + padded_column->insert_data_padded(ref.data, ref.size, _length); + } + + column_string = assert_cast(_column.get()); + } + const ColumnString::Char* char_data = column_string->get_chars().data(); const ColumnString::Offset* offset_cur = column_string->get_offsets().data() + _row_pos; const ColumnString::Offset* offset_end = offset_cur + _num_rows; diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h index eb104b1414d8ca..fd87b05477503b 100644 --- a/be/src/vec/olap/olap_data_convertor.h +++ b/be/src/vec/olap/olap_data_convertor.h @@ -17,6 +17,7 @@ #pragma once #include "olap/tablet_schema.h" +#include "vec/columns/column.h" #include "vec/core/block.h" namespace doris::vectorized { @@ -101,6 +102,7 @@ class OlapBlockDataConvertor { private: size_t _length; PaddedPODArray _slice; + ColumnPtr _column = nullptr; }; class OlapColumnDataConvertorVarChar : public OlapColumnDataConvertorBase { From 8c85c43a2e922aba71a73563e92894163e2ee0f1 Mon Sep 17 00:00:00 2001 From: Pxl Date: Mon, 23 May 2022 14:48:59 +0800 Subject: [PATCH 2/6] Update be/src/vec/olap/olap_data_convertor.cpp Co-authored-by: camby <104178625@qq.com> --- be/src/vec/olap/olap_data_convertor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index ee54cdb7b209dc..88a422735714d5 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -363,7 +363,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorChar::convert_to_olap() { _column = vectorized::ColumnString::create(); auto padded_column = assert_cast(_column->assume_mutable().get()); - padded_column->resize(rows); + padded_column->reserve(rows); for (size_t i = 0; i < rows; i++) { auto ref = column_string->get_data_at(i); From ca3f917165bdf03e9729b1b6604a3b06c88798c7 Mon Sep 17 00:00:00 2001 From: BiteTheDDDDt Date: Mon, 23 May 2022 16:58:15 +0800 Subject: [PATCH 3/6] update --- be/CMakeLists.txt | 2 +- be/src/vec/columns/column_string.h | 12 +--- be/src/vec/olap/olap_data_convertor.cpp | 14 +---- be/src/vec/olap/olap_data_convertor.h | 34 +++++++++++ be/test/CMakeLists.txt | 1 + be/test/vec/exec/vtablet_sink_test.cpp | 2 +- be/test/vec/olap/char_type_padding_test.cpp | 68 +++++++++++++++++++++ 7 files changed, 111 insertions(+), 22 deletions(-) create mode 100644 be/test/vec/olap/char_type_padding_test.cpp diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 7c532583261215..6079cfa5fc96cc 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -396,7 +396,7 @@ set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DBOOST_SYSTEM_NO_DEPRECATED") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DBRPC_ENABLE_CPU_PROFILER") if (USE_LLD) - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -fuse-ld=lld") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld") endif () if (USE_LIBCPP AND COMPILER_CLANG) diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 40231527a07f46..421255a77bd05d 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -45,6 +45,7 @@ class ColumnString final : public COWHelper { private: friend class COWHelper; + friend class OlapBlockDataConvertor; /// Maps i'th position to offset to i+1'th element. Last offset maps to the end of all chars (is the size of all chars). Offsets offsets; @@ -75,8 +76,6 @@ class ColumnString final : public COWHelper { size_t size() const override { return offsets.size(); } - size_t chars_size() const { return chars.size(); } - size_t byte_size() const override { return chars.size() + offsets.size() * sizeof(offsets[0]); } size_t allocated_bytes() const override { @@ -150,17 +149,12 @@ class ColumnString final : public COWHelper { } void insert_data(const char* pos, size_t length) override { - insert_data_padded(pos, length, length); - } - - // if lengthsize(); - if (column_string->chars_size() != _length * rows) { - _column = vectorized::ColumnString::create(); - auto padded_column = - assert_cast(_column->assume_mutable().get()); - padded_column->reserve(rows); - - for (size_t i = 0; i < rows; i++) { - auto ref = column_string->get_data_at(i); - padded_column->insert_data_padded(ref.data, ref.size, _length); - } - + // If column_string is not padded to full, we should do padding here. + if (should_padding(column_string, _length)) { + _column = clone_and_padding(column_string, rows); column_string = assert_cast(_column.get()); } diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h index fd87b05477503b..b9c4ff32674793 100644 --- a/be/src/vec/olap/olap_data_convertor.h +++ b/be/src/vec/olap/olap_data_convertor.h @@ -18,6 +18,8 @@ #pragma once #include "olap/tablet_schema.h" #include "vec/columns/column.h" +#include "vec/columns/column_string.h" +#include "vec/common/string_ref.h" #include "vec/core/block.h" namespace doris::vectorized { @@ -100,6 +102,38 @@ class OlapBlockDataConvertor { Status convert_to_olap() override; private: + static bool should_padding(const ColumnString* column, size_t padding_length) { + // Check sum of data length, including terminating zero. + return column->size() * (padding_length + 1) != column->chars.size(); + } + + static void insert_data_padded(ColumnString* column, StringRef str, size_t padding_length) { + const size_t old_size = column->chars.size(); + const size_t data_size = old_size + str.size; + const size_t full_size = old_size + padding_length + 1; + + column->chars.resize(full_size); + column->offsets.push_back(full_size); + + if (str.size) { + memcpy(column->chars.data() + old_size, str.data, str.size); + } + memset(column->chars.data() + data_size, 0, full_size - data_size); + } + + static ColumnPtr clone_and_padding(const ColumnString* input, size_t padding_length) { + auto column = vectorized::ColumnString::create(); + auto padded_column = + assert_cast(column->assume_mutable().get()); + padded_column->reserve(input->size()); + + for (size_t i = 0; i < input->size(); i++) { + insert_data_padded(padded_column, input->get_data_at(i), padding_length); + } + + return column; + } + size_t _length; PaddedPODArray _slice; ColumnPtr _column = nullptr; diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt index 38700a4bc2e60c..4bafdb1aae23c0 100644 --- a/be/test/CMakeLists.txt +++ b/be/test/CMakeLists.txt @@ -358,6 +358,7 @@ set(VEC_TEST_FILES vec/function/table_function_test.cpp vec/runtime/vdata_stream_test.cpp vec/utils/arrow_column_to_doris_column_test.cpp + vec/olap/char_type_padding_test.cpp ) add_executable(doris_be_test diff --git a/be/test/vec/exec/vtablet_sink_test.cpp b/be/test/vec/exec/vtablet_sink_test.cpp index d8ab7da2c33021..67ae97128da59d 100644 --- a/be/test/vec/exec/vtablet_sink_test.cpp +++ b/be/test/vec/exec/vtablet_sink_test.cpp @@ -115,7 +115,7 @@ class VTestInternalService : public PBackendService { void tablet_writer_add_block(google::protobuf::RpcController* controller, const PTabletWriterAddBlockRequest* request, PTabletWriterAddBlockResult* response, - google::protobuf::Closure* done) { + google::protobuf::Closure* done) override { brpc::ClosureGuard done_guard(done); { std::lock_guard l(_lock); diff --git a/be/test/vec/olap/char_type_padding_test.cpp b/be/test/vec/olap/char_type_padding_test.cpp new file mode 100644 index 00000000000000..fa7eaffc317bd7 --- /dev/null +++ b/be/test/vec/olap/char_type_padding_test.cpp @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "vec/columns/column_string.h" +#include "vec/olap/olap_data_convertor.h" + +namespace doris::vectorized { + +using ConvertorChar = OlapBlockDataConvertor::OlapColumnDataConvertorChar; + +TEST(CharTypePaddingTest, CharTypePaddingFullTest) { + auto input = ColumnString::create(); + + std::string str = "Allemande"; + size_t rows = 10; + + for (size_t i = 0; i < rows; i++) { + input->insert_data(str.data(), str.length()); + } + EXPECT_FALSE(ConvertorChar::should_padding(input, str.length())); + + input->insert_data(str.data(), str.length() - 1); + EXPECT_TRUE(ConvertorChar::should_padding(input, str.length())); +} + +TEST(CharTypePaddingTest, CharTypePaddingDataTest) { + auto input = ColumnString::create(); + + std::string str = "Allemande"; + + size_t rows = str.length(); + for (int i = 0; i < rows; i++) { + input->insert_data(str.data(), str.length() - i); + } + + auto output = ConvertorChar::clone_and_padding(input, str.length()); + + for (int i = 0; i < rows; i++) { + auto cell = output->get_data_at(i).to_string(); + EXPECT_EQ(cell.length(), str.length()); + + auto str_real = std::string(cell.data(), str.length() - i); + auto str_expect = str.substr(0, str.length() - i); + EXPECT_EQ(str_real, str_expect); + + for (int j = str.length() - i; j < str.length(); j++) { + EXPECT_EQ(cell[j], 0); + } + } +} + +} // namespace doris::vectorized From f3e132b9fdf4ca6e505b22ff93cd94843fca0e14 Mon Sep 17 00:00:00 2001 From: BiteTheDDDDt Date: Tue, 24 May 2022 11:13:40 +0800 Subject: [PATCH 4/6] update --- be/src/vec/olap/olap_data_convertor.h | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h index b9c4ff32674793..d0f5d01525df2f 100644 --- a/be/src/vec/olap/olap_data_convertor.h +++ b/be/src/vec/olap/olap_data_convertor.h @@ -107,28 +107,23 @@ class OlapBlockDataConvertor { return column->size() * (padding_length + 1) != column->chars.size(); } - static void insert_data_padded(ColumnString* column, StringRef str, size_t padding_length) { - const size_t old_size = column->chars.size(); - const size_t data_size = old_size + str.size; - const size_t full_size = old_size + padding_length + 1; - - column->chars.resize(full_size); - column->offsets.push_back(full_size); - - if (str.size) { - memcpy(column->chars.data() + old_size, str.data, str.size); - } - memset(column->chars.data() + data_size, 0, full_size - data_size); - } - static ColumnPtr clone_and_padding(const ColumnString* input, size_t padding_length) { auto column = vectorized::ColumnString::create(); auto padded_column = assert_cast(column->assume_mutable().get()); - padded_column->reserve(input->size()); + + column->offsets.resize(input->size()); + column->chars.resize(input->size() * (padding_length + 1)); + memset(padded_column->chars.data(), 0, input->size() * (padding_length + 1)); for (size_t i = 0; i < input->size(); i++) { - insert_data_padded(padded_column, input->get_data_at(i), padding_length); + column->offsets[i] = (i + 1) * (padding_length + 1); + + auto str = input->get_data_at(i); + if (str.size) { + memcpy(padded_column->chars.data() + i * (padding_length + 1), str.data, + str.size); + } } return column; From 2d03d4ba90ab286f2100ac6b4d817d84e3744343 Mon Sep 17 00:00:00 2001 From: BiteTheDDDDt Date: Wed, 25 May 2022 16:30:38 +0800 Subject: [PATCH 5/6] fix wrong argument pass --- be/src/vec/olap/olap_data_convertor.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index 0cc26bdadd5a1d..d1910919b5d546 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -358,10 +358,9 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorChar::convert_to_olap() { assert(column_string); - size_t rows = column_string->size(); // If column_string is not padded to full, we should do padding here. if (should_padding(column_string, _length)) { - _column = clone_and_padding(column_string, rows); + _column = clone_and_padding(column_string, _length); column_string = assert_cast(_column.get()); } From 50787794ff501a364626eb6a7594f452b2fc9345 Mon Sep 17 00:00:00 2001 From: BiteTheDDDDt Date: Thu, 26 May 2022 11:42:10 +0800 Subject: [PATCH 6/6] fix clang build --- be/src/olap/memtable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index cf58f7e4ba6265..77b03130049542 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -282,7 +282,7 @@ void MemTable::_collect_vskiplist_results() { _input_mutable_block.swap(_output_mutable_block); //TODO(weixang):opt here. std::unique_ptr empty_input_block = - std::move(in_block.create_same_struct_block(0)); + in_block.create_same_struct_block(0); _output_mutable_block = vectorized::MutableBlock::build_mutable_block(empty_input_block.get()); _output_mutable_block.clear_column_data();