From e5adcffef4c33af3f533230e9420bffa4ca2e9bd Mon Sep 17 00:00:00 2001 From: HappenLee Date: Thu, 11 Apr 2024 01:23:05 +0800 Subject: [PATCH 01/10] save column append data --- be/src/exec/rowid_fetcher.h | 2 +- be/src/runtime/primitive_type.h | 5 +- .../aggregate_function_orthogonal_bitmap.h | 2 +- be/src/vec/columns/column.h | 8 +- be/src/vec/columns/column_string.cpp | 180 ++++++++++++------ be/src/vec/columns/column_string.h | 58 +++--- .../format/parquet/byte_array_dict_decoder.h | 2 +- be/src/vec/exec/format/parquet/decoder.h | 10 +- .../parquet/vparquet_column_chunk_reader.h | 2 +- .../format/parquet/vparquet_column_reader.cpp | 3 - .../format/parquet/vparquet_column_reader.h | 2 +- be/src/vec/exec/format/table/iceberg_reader.h | 2 +- .../vec/exprs/table_function/vexplode_split.h | 2 +- .../functions/array/function_arrays_overlap.h | 5 +- be/src/vec/functions/in.h | 7 +- be/src/vec/json/parse2column.h | 2 +- be/src/vec/jsonb/serialize.h | 4 - 17 files changed, 168 insertions(+), 128 deletions(-) diff --git a/be/src/exec/rowid_fetcher.h b/be/src/exec/rowid_fetcher.h index 78184ae8febb14..7ca2ef19143b0a 100644 --- a/be/src/exec/rowid_fetcher.h +++ b/be/src/exec/rowid_fetcher.h @@ -27,6 +27,7 @@ #include "common/status.h" #include "exec/tablet_info.h" // DorisNodesInfo #include "olap/storage_engine.h" +#include "vec/columns/column_string.h" #include "vec/core/block.h" #include "vec/data_types/data_type.h" @@ -37,7 +38,6 @@ class RuntimeState; class TupleDescriptor; namespace vectorized { -class ColumnString; class MutableBlock; } // namespace vectorized diff --git a/be/src/runtime/primitive_type.h b/be/src/runtime/primitive_type.h index d6a4354c9a6d5c..0e74a86d76c94e 100644 --- a/be/src/runtime/primitive_type.h +++ b/be/src/runtime/primitive_type.h @@ -28,6 +28,7 @@ #include "olap/decimal12.h" #include "runtime/define_primitive_type.h" #include "vec/columns/column_decimal.h" +#include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" #include 
"vec/columns/columns_number.h" #include "vec/core/types.h" @@ -36,10 +37,6 @@ namespace doris { -namespace vectorized { -class ColumnString; -} // namespace vectorized - class DecimalV2Value; struct StringRef; struct JsonBinaryValue; diff --git a/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.h b/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.h index f0fd67f4a851da..5877020eaf92eb 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.h +++ b/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.h @@ -33,6 +33,7 @@ #include "util/bitmap_value.h" #include "vec/aggregate_functions/aggregate_function.h" #include "vec/columns/column_complex.h" +#include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" #include "vec/core/types.h" #include "vec/data_types/data_type_bitmap.h" @@ -44,7 +45,6 @@ namespace vectorized { class Arena; class BufferReadable; class BufferWritable; -class ColumnString; class IColumn; } // namespace vectorized } // namespace doris diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index a6d48a41fca619..a03b30375dd59a 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -77,14 +77,13 @@ class IColumn : public COW { /// If you want to copy column for modification, look at 'mutate' method. virtual MutablePtr clone() const = 0; -protected: +public: // 64bit offsets now only Array type used, so we make it protected // to avoid use IColumn::Offset64 directly. // please use ColumnArray::Offset64 instead if we need. using Offset64 = UInt64; using Offsets64 = PaddedPODArray; -public: // 32bit offsets for string using Offset = UInt32; using Offsets = PaddedPODArray; @@ -100,6 +99,11 @@ class IColumn : public COW { */ virtual Ptr convert_to_full_column_if_const() const { return get_ptr(); } + /** If column isn't constant, returns nullptr (or itself). 
+ * If column is constant, transforms constant to full column (if column type allows such transform) and return it. + */ + virtual Ptr convert_to_full_column_if_overflow() { return get_ptr(); } + /// If column isn't ColumnLowCardinality, return itself. /// If column is ColumnLowCardinality, transforms is to full column. virtual Ptr convert_to_full_column_if_low_cardinality() const { return get_ptr(); } diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 76db0e58c44580..10593bd031fcef 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. // This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/ColumnString.cpp +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/ColumnStr.cpp // and modified by Doris #include "vec/columns/column_string.h" @@ -31,11 +31,11 @@ #include "vec/common/memcmp_small.h" #include "vec/common/unaligned.h" #include "vec/core/sort_block.h" -#include "vec/data_types/data_type.h" namespace doris::vectorized { -void ColumnString::sanity_check() const { +template +void ColumnStr::sanity_check() const { auto count = offsets.size(); if (chars.size() != offsets[count - 1]) { LOG(FATAL) << "row count: " << count << ", chars.size(): " << chars.size() << ", offset[" @@ -52,8 +52,9 @@ void ColumnString::sanity_check() const { } } -MutableColumnPtr ColumnString::clone_resized(size_t to_size) const { - auto res = ColumnString::create(); +template +MutableColumnPtr ColumnStr::clone_resized(size_t to_size) const { + auto res = ColumnStr::create(); if (to_size == 0) { return res; } @@ -79,29 +80,31 @@ MutableColumnPtr ColumnString::clone_resized(size_t to_size) const { return res; } -MutableColumnPtr ColumnString::get_shrinked_column() { - auto shrinked_column = ColumnString::create(); +template +MutableColumnPtr 
ColumnStr::get_shrinked_column() { + auto shrinked_column = ColumnStr::create(); shrinked_column->get_offsets().reserve(offsets.size()); shrinked_column->get_chars().reserve(chars.size()); for (int i = 0; i < size(); i++) { StringRef str = get_data_at(i); - reinterpret_cast(shrinked_column.get()) + reinterpret_cast*>(shrinked_column.get()) ->insert_data(str.data, strnlen(str.data, str.size)); } return shrinked_column; } -void ColumnString::insert_range_from(const IColumn& src, size_t start, size_t length) { +template +void ColumnStr::insert_range_from(const IColumn& src, size_t start, size_t length) { if (length == 0) { return; } - const ColumnString& src_concrete = assert_cast(src); + const ColumnStr& src_concrete = assert_cast&>(src); if (start + length > src_concrete.offsets.size()) { throw doris::Exception( doris::ErrorCode::INTERNAL_ERROR, - "Parameter out of bound in IColumnString::insert_range_from method."); + "Parameter out of bound in IColumnStr::insert_range_from method."); } size_t nested_offset = src_concrete.offset_at(start); @@ -126,9 +129,10 @@ void ColumnString::insert_range_from(const IColumn& src, size_t start, size_t le } } -void ColumnString::insert_indices_from(const IColumn& src, const uint32_t* indices_begin, +template +void ColumnStr::insert_indices_from(const IColumn& src, const uint32_t* indices_begin, const uint32_t* indices_end) { - const auto& src_str = assert_cast(src); + const auto& src_str = assert_cast&>(src); const auto* src_offset_data = src_str.offsets.data(); auto old_char_size = chars.size(); @@ -159,7 +163,8 @@ void ColumnString::insert_indices_from(const IColumn& src, const uint32_t* indic } } -void ColumnString::update_crcs_with_value(uint32_t* __restrict hashes, doris::PrimitiveType type, +template +void ColumnStr::update_crcs_with_value(uint32_t* __restrict hashes, doris::PrimitiveType type, uint32_t rows, uint32_t offset, const uint8_t* __restrict null_data) const { auto s = rows; @@ -180,45 +185,50 @@ void 
ColumnString::update_crcs_with_value(uint32_t* __restrict hashes, doris::Pr } } -ColumnPtr ColumnString::filter(const Filter& filt, ssize_t result_size_hint) const { +template +ColumnPtr ColumnStr::filter(const IColumn::Filter& filt, ssize_t result_size_hint) const { if (offsets.size() == 0) { - return ColumnString::create(); + return ColumnStr::create(); } - auto res = ColumnString::create(); + auto res = ColumnStr::create(); Chars& res_chars = res->chars; - Offsets& res_offsets = res->offsets; + IColumn::Offsets& res_offsets = res->offsets; - filter_arrays_impl(chars, offsets, res_chars, res_offsets, filt, - result_size_hint); + filter_arrays_impl(chars, offsets, res_chars, res_offsets, filt, + result_size_hint); return res; } -size_t ColumnString::filter(const Filter& filter) { +template +size_t ColumnStr::filter(const IColumn::Filter& filter) { CHECK_EQ(filter.size(), offsets.size()); if (offsets.size() == 0) { resize(0); return 0; } - return filter_arrays_impl(chars, offsets, filter); + return filter_arrays_impl(chars, offsets, filter); } -Status ColumnString::filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) { - auto* col = static_cast(col_ptr); +template +Status ColumnStr::filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) { + auto* col = static_cast*>(col_ptr); Chars& res_chars = col->chars; - Offsets& res_offsets = col->offsets; - Filter filter; + IColumn::Offsets& res_offsets = col->offsets; + IColumn::Filter filter; filter.resize_fill(offsets.size(), 0); for (size_t i = 0; i < sel_size; i++) { filter[sel[i]] = 1; } - filter_arrays_impl(chars, offsets, res_chars, res_offsets, filter, sel_size); + filter_arrays_impl(chars, offsets, res_chars, res_offsets, filter, + sel_size); return Status::OK(); } -ColumnPtr ColumnString::permute(const Permutation& perm, size_t limit) const { +template +ColumnPtr ColumnStr::permute(const IColumn::Permutation& perm, size_t limit) const { size_t size = offsets.size(); if 
(limit == 0) { @@ -232,13 +242,13 @@ ColumnPtr ColumnString::permute(const Permutation& perm, size_t limit) const { } if (limit == 0) { - return ColumnString::create(); + return ColumnStr::create(); } - auto res = ColumnString::create(); + auto res = ColumnStr::create(); Chars& res_chars = res->chars; - Offsets& res_offsets = res->offsets; + IColumn::Offsets& res_offsets = res->offsets; if (limit == size) { res_chars.resize(chars.size()); @@ -252,7 +262,7 @@ ColumnPtr ColumnString::permute(const Permutation& perm, size_t limit) const { res_offsets.resize(limit); - Offset current_new_offset = 0; + IColumn::Offset current_new_offset = 0; for (size_t i = 0; i < limit; ++i) { size_t j = perm[i]; @@ -269,7 +279,8 @@ ColumnPtr ColumnString::permute(const Permutation& perm, size_t limit) const { return res; } -StringRef ColumnString::serialize_value_into_arena(size_t n, Arena& arena, +template +StringRef ColumnStr::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const { uint32_t string_size(size_at(n)); uint32_t offset(offset_at(n)); @@ -284,7 +295,8 @@ StringRef ColumnString::serialize_value_into_arena(size_t n, Arena& arena, return res; } -const char* ColumnString::deserialize_and_insert_from_arena(const char* pos) { +template +const char* ColumnStr::deserialize_and_insert_from_arena(const char* pos) { const uint32_t string_size = unaligned_load(pos); pos += sizeof(string_size); @@ -298,7 +310,8 @@ const char* ColumnString::deserialize_and_insert_from_arena(const char* pos) { return pos + string_size; } -size_t ColumnString::get_max_row_byte_size() const { +template +size_t ColumnStr::get_max_row_byte_size() const { size_t max_size = 0; size_t num_rows = offsets.size(); for (size_t i = 0; i < num_rows; ++i) { @@ -308,7 +321,8 @@ size_t ColumnString::get_max_row_byte_size() const { return max_size + sizeof(uint32_t); } -void ColumnString::serialize_vec(std::vector& keys, size_t num_rows, +template +void ColumnStr::serialize_vec(std::vector& keys, 
size_t num_rows, size_t max_row_byte_size) const { for (size_t i = 0; i < num_rows; ++i) { uint32_t offset(offset_at(i)); @@ -321,7 +335,8 @@ void ColumnString::serialize_vec(std::vector& keys, size_t num_rows, } } -void ColumnString::serialize_vec_with_null_map(std::vector& keys, size_t num_rows, +template +void ColumnStr::serialize_vec_with_null_map(std::vector& keys, size_t num_rows, const uint8_t* null_map) const { for (size_t i = 0; i < num_rows; ++i) { if (null_map[i] == 0) { @@ -336,7 +351,8 @@ void ColumnString::serialize_vec_with_null_map(std::vector& keys, siz } } -void ColumnString::deserialize_vec(std::vector& keys, const size_t num_rows) { +template +void ColumnStr::deserialize_vec(std::vector& keys, const size_t num_rows) { for (size_t i = 0; i != num_rows; ++i) { auto original_ptr = keys[i].data; keys[i].data = deserialize_and_insert_from_arena(original_ptr); @@ -344,7 +360,8 @@ void ColumnString::deserialize_vec(std::vector& keys, const size_t nu } } -void ColumnString::deserialize_vec_with_null_map(std::vector& keys, +template +void ColumnStr::deserialize_vec_with_null_map(std::vector& keys, const size_t num_rows, const uint8_t* null_map) { for (size_t i = 0; i != num_rows; ++i) { if (null_map[i] == 0) { @@ -357,16 +374,17 @@ void ColumnString::deserialize_vec_with_null_map(std::vector& keys, } } +template template -ColumnPtr ColumnString::index_impl(const PaddedPODArray& indexes, size_t limit) const { +ColumnPtr ColumnStr::index_impl(const PaddedPODArray& indexes, size_t limit) const { if (limit == 0) { - return ColumnString::create(); + return ColumnStr::create(); } - auto res = ColumnString::create(); + auto res = ColumnStr::create(); Chars& res_chars = res->chars; - Offsets& res_offsets = res->offsets; + IColumn::Offsets& res_offsets = res->offsets; size_t new_chars_size = 0; for (size_t i = 0; i < limit; ++i) { @@ -377,7 +395,7 @@ ColumnPtr ColumnString::index_impl(const PaddedPODArray& indexes, size_t l res_offsets.resize(limit); - Offset 
current_new_offset = 0; + IColumn::Offset current_new_offset = 0; for (size_t i = 0; i < limit; ++i) { size_t j = indexes[i]; @@ -394,10 +412,11 @@ ColumnPtr ColumnString::index_impl(const PaddedPODArray& indexes, size_t l return res; } +template template -struct ColumnString::less { - const ColumnString& parent; - explicit less(const ColumnString& parent_) : parent(parent_) {} +struct ColumnStr::less { + const ColumnStr& parent; + explicit less(const ColumnStr& parent_) : parent(parent_) {} bool operator()(size_t lhs, size_t rhs) const { int res = memcmp_small_allow_overflow15( parent.chars.data() + parent.offset_at(lhs), parent.size_at(lhs), @@ -407,8 +426,9 @@ struct ColumnString::less { } }; -void ColumnString::get_permutation(bool reverse, size_t limit, int /*nan_direction_hint*/, - Permutation& res) const { +template +void ColumnStr::get_permutation(bool reverse, size_t limit, int /*nan_direction_hint*/, + IColumn::Permutation& res) const { size_t s = offsets.size(); res.resize(s); for (size_t i = 0; i < s; ++i) { @@ -434,24 +454,25 @@ void ColumnString::get_permutation(bool reverse, size_t limit, int /*nan_directi } } -ColumnPtr ColumnString::replicate(const Offsets& replicate_offsets) const { +template +ColumnPtr ColumnStr::replicate(const IColumn::Offsets& replicate_offsets) const { size_t col_size = size(); column_match_offsets_size(col_size, replicate_offsets.size()); - auto res = ColumnString::create(); + auto res = ColumnStr::create(); if (0 == col_size) { return res; } Chars& res_chars = res->chars; - Offsets& res_offsets = res->offsets; + IColumn::Offsets& res_offsets = res->offsets; res_chars.reserve(chars.size() / col_size * replicate_offsets.back()); res_offsets.reserve(replicate_offsets.back()); - Offset prev_replicate_offset = 0; - Offset prev_string_offset = 0; - Offset current_new_offset = 0; + IColumn::Offset prev_replicate_offset = 0; + IColumn::Offset prev_string_offset = 0; + IColumn::Offset current_new_offset = 0; for (size_t i = 0; i < 
col_size; ++i) { size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; @@ -474,12 +495,14 @@ ColumnPtr ColumnString::replicate(const Offsets& replicate_offsets) const { return res; } -void ColumnString::reserve(size_t n) { +template +void ColumnStr::reserve(size_t n) { offsets.reserve(n); chars.reserve(n); } -void ColumnString::resize(size_t n) { +template +void ColumnStr::resize(size_t n) { auto origin_size = size(); if (origin_size > n) { offsets.resize(n); @@ -488,18 +511,20 @@ void ColumnString::resize(size_t n) { } } -void ColumnString::sort_column(const ColumnSorter* sorter, EqualFlags& flags, +template +void ColumnStr::sort_column(const ColumnSorter* sorter, EqualFlags& flags, IColumn::Permutation& perms, EqualRange& range, bool last_column) const { - sorter->sort_column(static_cast(*this), flags, perms, range, last_column); + sorter->sort_column(static_cast&>(*this), flags, perms, range, last_column); } -void ColumnString::compare_internal(size_t rhs_row_id, const IColumn& rhs, int nan_direction_hint, +template +void ColumnStr::compare_internal(size_t rhs_row_id, const IColumn& rhs, int nan_direction_hint, int direction, std::vector& cmp_res, uint8* __restrict filter) const { auto sz = this->size(); DCHECK(cmp_res.size() == sz); - const auto& cmp_base = assert_cast(rhs).get_data_at(rhs_row_id); + const auto& cmp_base = assert_cast&>(rhs).get_data_at(rhs_row_id); size_t begin = simd::find_zero(cmp_res, 0); while (begin < sz) { size_t end = simd::find_one(cmp_res, begin + 1); @@ -518,8 +543,39 @@ void ColumnString::compare_internal(size_t rhs_row_id, const IColumn& rhs, int n } } -ColumnPtr ColumnString::index(const IColumn& indexes, size_t limit) const { +template +ColumnPtr ColumnStr::index(const IColumn& indexes, size_t limit) const { return select_index_impl(*this, indexes, limit); } +template +ColumnPtr ColumnStr::convert_to_full_column_if_overflow() { + if (std::is_same_v && chars.size() > std::numeric_limits::max()) { + auto new_col 
= ColumnStr::create(); + + const auto length = offsets.size(); + std::swap(new_col->get_chars(), chars); + new_col->get_offsets().resize(length); + + size_t base = 0; + size_t start = 0; + size_t end = length; + + while (start < end) { + size_t mid = find_first_overflow_point(_offsets, start, end); + for (size_t i = start; i < mid; i++) { + new_column->get_offset()[i] = static_cast(_offsets[i]) + base; + } + base += Column::MAX_CAPACITY_LIMIT; + start = mid; + } + + offsets.clear(); + return new_col; + } + return this->get_ptr(); +} + +template class ColumnStr; +template class ColumnStr; } // namespace doris::vectorized diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index c2eedfbc7911d4..aed9354f8dc318 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. // This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/ColumnString.h +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/ColumnStr.h // and modified by Doris #pragma once @@ -58,7 +58,8 @@ namespace doris::vectorized { /** Column for String values. */ -class ColumnString final : public COWHelper { +template +class ColumnStr final : public COWHelper> { public: using Char = UInt8; using Chars = PaddedPODArray; @@ -75,14 +76,14 @@ class ColumnString final : public COWHelper { private: // currently Offsets is uint32, if chars.size() exceeds 4G, offset will overflow. - // limit chars.size() and check the size when inserting data into ColumnString. + // limit chars.size() and check the size when inserting data into ColumnStr. static constexpr size_t MAX_STRING_SIZE = 0xffffffff; - friend class COWHelper; + friend class COWHelper>; friend class OlapBlockDataConvertor; /// Maps i'th position to offset to i+1'th element. Last offset maps to the end of all chars (is the size of all chars). 
- Offsets offsets; + PaddedPODArray offsets; /// Bytes of strings, placed contiguously. /// For convenience, every string ends with terminating zero byte. Note that strings could contain zero bytes in the middle. @@ -98,9 +99,9 @@ class ColumnString final : public COWHelper { template struct lessWithCollation; - ColumnString() = default; + ColumnStr() = default; - ColumnString(const ColumnString& src) + ColumnStr(const ColumnStr& src) : offsets(src.offsets.begin(), src.offsets.end()), chars(src.chars.begin(), src.chars.end()) {} @@ -164,7 +165,7 @@ class ColumnString final : public COWHelper { } void insert_from(const IColumn& src_, size_t n) override { - const ColumnString& src = assert_cast(src_); + const ColumnStr& src = assert_cast&>(src_); const size_t size_to_append = src.offsets[n] - src.offsets[n - 1]; /// -1th index is Ok, see PaddedPODArray. @@ -244,7 +245,6 @@ class ColumnString final : public COWHelper { void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_, const size_t num) override { - static_assert(sizeof(offsets_[0]) == sizeof(*offsets.data())); if (UNLIKELY(num == 0)) { return; } @@ -313,9 +313,9 @@ class ColumnString final : public COWHelper { } } - template - void insert_many_strings_fixed_length(const StringRef* strings, size_t num) - __attribute__((noinline)); + // template + // void insert_many_strings_fixed_length(const StringRef* strings, size_t num) + // __attribute__((noinline)); template void insert_many_strings_fixed_length(const StringRef* strings, size_t num) { @@ -483,12 +483,12 @@ class ColumnString final : public COWHelper { void insert_indices_from(const IColumn& src, const uint32_t* indices_begin, const uint32_t* indices_end) override; - ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const override; - size_t filter(const Filter& filter) override; + ColumnPtr filter(const IColumn::Filter& filt, ssize_t result_size_hint) const override; + size_t filter(const IColumn::Filter& filter) 
override; Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) override; - ColumnPtr permute(const Permutation& perm, size_t limit) const override; + ColumnPtr permute(const IColumn::Permutation& perm, size_t limit) const override; void sort_column(const ColumnSorter* sorter, EqualFlags& flags, IColumn::Permutation& perms, EqualRange& range, bool last_column) const override; @@ -506,19 +506,19 @@ class ColumnString final : public COWHelper { int compare_at(size_t n, size_t m, const IColumn& rhs_, int /*nan_direction_hint*/) const override { - const ColumnString& rhs = assert_cast(rhs_); + const ColumnStr& rhs = assert_cast&>(rhs_); return memcmp_small_allow_overflow15(chars.data() + offset_at(n), size_at(n), rhs.chars.data() + rhs.offset_at(m), rhs.size_at(m)); } void get_permutation(bool reverse, size_t limit, int nan_direction_hint, - Permutation& res) const override; + IColumn::Permutation& res) const override; - ColumnPtr replicate(const Offsets& replicate_offsets) const override; + ColumnPtr replicate(const IColumn::Offsets& replicate_offsets) const override; void append_data_by_selector(MutableColumnPtr& res, const IColumn::Selector& selector) const override { - append_data_by_selector_impl(res, selector); + this->template append_data_by_selector_impl>(res, selector); } void append_data_by_selector(MutableColumnPtr& res, const IColumn::Selector& selector, @@ -534,14 +534,14 @@ class ColumnString final : public COWHelper { bool is_column_string() const override { return true; } bool structure_equals(const IColumn& rhs) const override { - return typeid(rhs) == typeid(ColumnString); + return typeid(rhs) == typeid(ColumnStr); } Chars& get_chars() { return chars; } const Chars& get_chars() const { return chars; } - Offsets& get_offsets() { return offsets; } - const Offsets& get_offsets() const { return offsets; } + auto& get_offsets() { return offsets; } + const auto& get_offsets() const { return offsets; } void clear() override { 
chars.clear(); @@ -561,25 +561,29 @@ class ColumnString final : public COWHelper { int direction, std::vector& cmp_res, uint8* __restrict filter) const override; MutableColumnPtr get_shinked_column() const { - auto shrinked_column = ColumnString::create(); + auto shrinked_column = ColumnStr::create(); for (int i = 0; i < size(); i++) { StringRef str = get_data_at(i); - reinterpret_cast(shrinked_column.get()) + reinterpret_cast*>(shrinked_column.get()) ->insert_data(str.data, strnlen(str.data, str.size)); } return shrinked_column; } - void get_indices_of_non_default_rows(Offsets64& indices, size_t from, + void get_indices_of_non_default_rows(IColumn::Offsets64& indices, size_t from, size_t limit) const override { - return get_indices_of_non_default_rows_impl(indices, from, limit); + return this->template get_indices_of_non_default_rows_impl>(indices, from, + limit); } ColumnPtr index(const IColumn& indexes, size_t limit) const override; double get_ratio_of_default_rows(double sample_ratio) const override { - return get_ratio_of_default_rows_impl(sample_ratio); + return this->template get_ratio_of_default_rows_impl>(sample_ratio); } + + ColumnPtr convert_to_full_column_if_overflow() override; }; +using ColumnString = ColumnStr; } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h index 0267cf17f755d9..91f7a590648335 100644 --- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h +++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h @@ -26,6 +26,7 @@ #include "common/status.h" #include "util/bit_util.h" +#include "vec/columns/column_string.h" #include "vec/columns/columns_number.h" #include "vec/common/string_ref.h" #include "vec/core/types.h" @@ -36,7 +37,6 @@ namespace doris { namespace vectorized { -class ColumnString; template class ColumnDecimal; } // namespace vectorized diff --git a/be/src/vec/exec/format/parquet/decoder.h 
b/be/src/vec/exec/format/parquet/decoder.h index efb7d3a5942ce0..4c74c639c35bc7 100644 --- a/be/src/vec/exec/format/parquet/decoder.h +++ b/be/src/vec/exec/format/parquet/decoder.h @@ -32,6 +32,7 @@ #include "util/slice.h" #include "vec/columns/column.h" #include "vec/columns/column_dictionary.h" +#include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" #include "vec/columns/columns_number.h" #include "vec/common/assert_cast.h" @@ -43,15 +44,6 @@ #include "vec/exec/format/format_common.h" #include "vec/exec/format/parquet/parquet_common.h" -namespace cctz { -class time_zone; -} // namespace cctz -namespace doris { -namespace vectorized { -class ColumnString; -} // namespace vectorized -} // namespace doris - namespace doris::vectorized { class Decoder { diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h index c8a49e098a53e9..eb97ab0c885bf3 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h @@ -29,6 +29,7 @@ #include "decoder.h" #include "level_decoder.h" #include "util/slice.h" +#include "vec/columns/column_string.h" #include "vec/columns/columns_number.h" #include "vec/data_types/data_type.h" #include "vec/exec/format/parquet/parquet_common.h" @@ -45,7 +46,6 @@ class BufferedStreamReader; struct IOContext; } // namespace io namespace vectorized { -class ColumnString; struct FieldSchema; } // namespace vectorized } // namespace doris diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index ec6f85dabfd986..6de7c06a7986f7 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -47,9 +47,6 @@ namespace doris { namespace io { struct IOContext; } // namespace io -namespace vectorized { -class ColumnString; -} // 
namespace vectorized } // namespace doris namespace doris::vectorized { diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_reader.h index 7d5a425edfb076..a3a2815e1af840 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.h @@ -31,6 +31,7 @@ #include "io/fs/buffered_reader.h" #include "io/fs/file_reader_writer_fwd.h" #include "parquet_column_convert.h" +#include "vec/columns/column_string.h" #include "vec/columns/columns_number.h" #include "vec/data_types/data_type.h" #include "vec/exec/format/parquet/parquet_common.h" @@ -44,7 +45,6 @@ namespace io { struct IOContext; } // namespace io namespace vectorized { -class ColumnString; struct FieldSchema; } // namespace vectorized } // namespace doris diff --git a/be/src/vec/exec/format/table/iceberg_reader.h b/be/src/vec/exec/format/table/iceberg_reader.h index 2c6dab877deb98..b5050666d6e683 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.h +++ b/be/src/vec/exec/format/table/iceberg_reader.h @@ -32,6 +32,7 @@ #include "table_format_reader.h" #include "util/runtime_profile.h" #include "vec/columns/column_dictionary.h" +#include "vec/columns/column_string.h" namespace tparquet { class KeyValue; @@ -53,7 +54,6 @@ struct TypeDescriptor; namespace vectorized { class Block; -class ColumnString; class GenericReader; class ShardedKVCache; class VExprContext; diff --git a/be/src/vec/exprs/table_function/vexplode_split.h b/be/src/vec/exprs/table_function/vexplode_split.h index 343f83406cc123..8223e739cba8da 100644 --- a/be/src/vec/exprs/table_function/vexplode_split.h +++ b/be/src/vec/exprs/table_function/vexplode_split.h @@ -24,6 +24,7 @@ #include #include "common/status.h" +#include "vec/columns/column_string.h" #include "vec/common/string_ref.h" #include "vec/data_types/data_type.h" #include "vec/exprs/table_function/table_function.h" @@ -31,7 +32,6 @@ namespace doris { 
namespace vectorized { class Block; -class ColumnString; } // namespace vectorized } // namespace doris diff --git a/be/src/vec/functions/array/function_arrays_overlap.h b/be/src/vec/functions/array/function_arrays_overlap.h index d1bca0ec02e2b4..b46c874f7f9917 100644 --- a/be/src/vec/functions/array/function_arrays_overlap.h +++ b/be/src/vec/functions/array/function_arrays_overlap.h @@ -31,6 +31,7 @@ #include "common/status.h" #include "vec/columns/column.h" #include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" #include "vec/columns/columns_number.h" #include "vec/common/assert_cast.h" @@ -50,10 +51,6 @@ namespace doris { class FunctionContext; - -namespace vectorized { -class ColumnString; -} // namespace vectorized } // namespace doris template struct DefaultHash; diff --git a/be/src/vec/functions/in.h b/be/src/vec/functions/in.h index 1bfbf7eb2d5b38..860df856be7ae0 100644 --- a/be/src/vec/functions/in.h +++ b/be/src/vec/functions/in.h @@ -37,6 +37,7 @@ #include "vec/columns/column.h" #include "vec/columns/column_const.h" #include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" #include "vec/columns/columns_number.h" #include "vec/common/string_ref.h" @@ -49,11 +50,7 @@ #include "vec/data_types/data_type_number.h" #include "vec/functions/function.h" -namespace doris { -namespace vectorized { -class ColumnString; -} // namespace vectorized -} // namespace doris +namespace doris {} // namespace doris namespace doris::vectorized { diff --git a/be/src/vec/json/parse2column.h b/be/src/vec/json/parse2column.h index 9f8291d83a5a6d..9df36bef283197 100644 --- a/be/src/vec/json/parse2column.h +++ b/be/src/vec/json/parse2column.h @@ -21,11 +21,11 @@ #include #include "vec/columns/column.h" +#include "vec/columns/column_string.h" #include "vec/common/string_ref.h" namespace doris { namespace vectorized { -class ColumnString; class SimdJSONParser; 
enum class ExtractType; template diff --git a/be/src/vec/jsonb/serialize.h b/be/src/vec/jsonb/serialize.h index 1442792ebf4ed4..c8f9e51d041d08 100644 --- a/be/src/vec/jsonb/serialize.h +++ b/be/src/vec/jsonb/serialize.h @@ -28,10 +28,6 @@ namespace doris { class TabletSchema; class TupleDescriptor; - -namespace vectorized { -class ColumnString; -} // namespace vectorized } // namespace doris namespace doris::vectorized { From 2bcca5f0dd8939b201fd11e0f1102cbb911063cb Mon Sep 17 00:00:00 2001 From: HappenLee Date: Wed, 17 Apr 2024 00:01:20 +0800 Subject: [PATCH 02/10] add col str 64 --- be/src/vec/columns/column_string.cpp | 89 ++++++++++++++++------------ be/src/vec/columns/column_string.h | 1 + be/src/vec/core/sort_block.h | 14 ++++- 3 files changed, 63 insertions(+), 41 deletions(-) diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 10593bd031fcef..19c74db070ee00 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -191,14 +191,17 @@ ColumnPtr ColumnStr::filter(const IColumn::Filter& filt, ssize_t result_size_ return ColumnStr::create(); } - auto res = ColumnStr::create(); - - Chars& res_chars = res->chars; - IColumn::Offsets& res_offsets = res->offsets; + if constexpr (std::is_same_v) { + auto res = ColumnStr::create(); + Chars &res_chars = res->chars; + IColumn::Offsets &res_offsets = res->offsets; - filter_arrays_impl(chars, offsets, res_chars, res_offsets, filt, - result_size_hint); - return res; + filter_arrays_impl(chars, offsets, res_chars, res_offsets, filt, + result_size_hint); + return res; + } else { + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "should not call filter in ColumnStr"); + } } template @@ -209,22 +212,30 @@ size_t ColumnStr::filter(const IColumn::Filter& filter) { return 0; } - return filter_arrays_impl(chars, offsets, filter); + if constexpr (std::is_same_v) { + return filter_arrays_impl(chars, offsets, filter); + } else { + throw 
doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "should not call filter in ColumnStr"); + } } template Status ColumnStr::filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) { - auto* col = static_cast*>(col_ptr); - Chars& res_chars = col->chars; - IColumn::Offsets& res_offsets = col->offsets; - IColumn::Filter filter; - filter.resize_fill(offsets.size(), 0); - for (size_t i = 0; i < sel_size; i++) { - filter[sel[i]] = 1; - } - filter_arrays_impl(chars, offsets, res_chars, res_offsets, filter, - sel_size); - return Status::OK(); + if constexpr (std::is_same_v) { + auto *col = static_cast *>(col_ptr); + Chars &res_chars = col->chars; + IColumn::Offsets &res_offsets = col->offsets; + IColumn::Filter filter; + filter.resize_fill(offsets.size(), 0); + for (size_t i = 0; i < sel_size; i++) { + filter[sel[i]] = 1; + } + filter_arrays_impl(chars, offsets, res_chars, res_offsets, filter, + sel_size); + return Status::OK(); + } else { + return Status::InternalError("should not call filter_by_selector in ColumnStr"); + } } template @@ -248,7 +259,7 @@ ColumnPtr ColumnStr::permute(const IColumn::Permutation& perm, size_t limit) auto res = ColumnStr::create(); Chars& res_chars = res->chars; - IColumn::Offsets& res_offsets = res->offsets; + auto& res_offsets = res->offsets; if (limit == size) { res_chars.resize(chars.size()); @@ -262,7 +273,7 @@ ColumnPtr ColumnStr::permute(const IColumn::Permutation& perm, size_t limit) res_offsets.resize(limit); - IColumn::Offset current_new_offset = 0; + T current_new_offset = 0; for (size_t i = 0; i < limit; ++i) { size_t j = perm[i]; @@ -384,7 +395,7 @@ ColumnPtr ColumnStr::index_impl(const PaddedPODArray& indexes, size_t l auto res = ColumnStr::create(); Chars& res_chars = res->chars; - IColumn::Offsets& res_offsets = res->offsets; + auto& res_offsets = res->offsets; size_t new_chars_size = 0; for (size_t i = 0; i < limit; ++i) { @@ -395,7 +406,7 @@ ColumnPtr ColumnStr::index_impl(const PaddedPODArray& indexes, 
size_t l res_offsets.resize(limit); - IColumn::Offset current_new_offset = 0; + T current_new_offset = 0; for (size_t i = 0; i < limit; ++i) { size_t j = indexes[i]; @@ -466,13 +477,13 @@ ColumnPtr ColumnStr::replicate(const IColumn::Offsets& replicate_offsets) con } Chars& res_chars = res->chars; - IColumn::Offsets& res_offsets = res->offsets; + auto& res_offsets = res->offsets; res_chars.reserve(chars.size() / col_size * replicate_offsets.back()); res_offsets.reserve(replicate_offsets.back()); - IColumn::Offset prev_replicate_offset = 0; - IColumn::Offset prev_string_offset = 0; - IColumn::Offset current_new_offset = 0; + T prev_replicate_offset = 0; + T prev_string_offset = 0; + T current_new_offset = 0; for (size_t i = 0; i < col_size; ++i) { size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; @@ -556,18 +567,18 @@ ColumnPtr ColumnStr::convert_to_full_column_if_overflow() { const auto length = offsets.size(); std::swap(new_col->get_chars(), chars); new_col->get_offsets().resize(length); - - size_t base = 0; - size_t start = 0; - size_t end = length; - - while (start < end) { - size_t mid = find_first_overflow_point(_offsets, start, end); - for (size_t i = start; i < mid; i++) { - new_column->get_offset()[i] = static_cast(_offsets[i]) + base; - } - base += Column::MAX_CAPACITY_LIMIT; - start = mid; + auto& large_offsets = new_col->get_offsets(); + + size_t loc = 0; + // TODO: recheck to SIMD the code + // if offset overflow. 
will be lower than offsets[loc - 1] + while (loc < length && offsets[loc] >= offsets[loc - 1]) { + large_offsets[loc] = offsets[loc]; + loc++; + } + while (loc < length) { + large_offsets[loc] = (offsets[loc] - offsets[loc - 1]) + large_offsets[loc - 1]; + loc++; + } offsets.clear(); diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index aed9354f8dc318..c9593cf9bf0b94 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -586,4 +586,5 @@ class ColumnStr final : public COWHelper> { }; using ColumnString = ColumnStr; +using ColumnString64 = ColumnStr; } // namespace doris::vectorized diff --git a/be/src/vec/core/sort_block.h b/be/src/vec/core/sort_block.h index ab14c34422d11b..00724e9f873282 100644 --- a/be/src/vec/core/sort_block.h +++ b/be/src/vec/core/sort_block.h @@ -223,6 +223,15 @@ class ColumnSorter { } } + void sort_column(const ColumnString64& column, EqualFlags& flags, IColumn::Permutation& perms, + EqualRange& range, bool last_column) const { + if (!_should_inline_value(perms)) { + _sort_by_default(column, flags, perms, range, last_column); + } else { + _sort_by_inlined_permutation(column, flags, perms, range, last_column); + } + } + void sort_column(const ColumnNullable& column, EqualFlags& flags, IColumn::Permutation& perms, EqualRange& range, bool last_column) const { if (!column.has_null()) { @@ -324,7 +333,7 @@ class ColumnSorter { if constexpr (std::is_same_v> || std::is_same_v>) { permutation_for_column[i].inline_value = column.get_data()[row_id]; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v || std::is_same_v) { permutation_for_column[i].inline_value = column.get_data_at(row_id); } else { static_assert(always_false_v); @@ -338,7 +347,8 @@ class ColumnSorter { EqualRange& range, bool last_column) const { int new_limit = _limit; auto comparator = [&](const size_t a, const size_t b) { - if constexpr (!std::is_same_v) { + if constexpr
(!std::is_same_v && + !std::is_same_v) { auto value_a = column.get_data()[a]; auto value_b = column.get_data()[b]; return value_a > value_b ? 1 : (value_a < value_b ? -1 : 0); From fe89493a506c89f9d2463316e2470ece8057104c Mon Sep 17 00:00:00 2001 From: HappenLee Date: Wed, 17 Apr 2024 00:09:04 +0800 Subject: [PATCH 03/10] save code except column string --- be/src/pipeline/exec/hashjoin_build_sink.cpp | 3 +- be/src/vec/columns/column.h | 16 ++++++--- be/src/vec/columns/column_array.cpp | 34 ++++++++++++++++++ be/src/vec/columns/column_array.h | 9 +++++ be/src/vec/columns/column_map.cpp | 36 ++++++++++++++++++++ be/src/vec/columns/column_map.h | 8 +++++ be/src/vec/columns/column_nullable.cpp | 11 ++++++ be/src/vec/columns/column_nullable.h | 9 ++++- be/src/vec/columns/column_struct.cpp | 9 +++++ be/src/vec/columns/column_struct.h | 11 +++++- be/src/vec/core/block.h | 31 +++++++++++++++++ be/src/vec/exec/join/vhash_join_node.cpp | 3 +- 12 files changed, 172 insertions(+), 8 deletions(-) diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index d4dc1956400f03..8b897528e5035c 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -253,6 +253,7 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, return Status::OK(); } COUNTER_UPDATE(_build_rows_counter, rows); + block.replace_if_overflow(); vectorized::ColumnRawPtrs raw_ptrs(_build_expr_ctxs.size()); @@ -519,7 +520,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* res_col_ids)); SCOPED_TIMER(local_state._build_side_merge_block_timer); - RETURN_IF_ERROR(local_state._build_side_mutable_block.merge(*in_block)); + RETURN_IF_ERROR(local_state._build_side_mutable_block.merge_ignore_overflow(*in_block)); COUNTER_UPDATE(local_state._build_blocks_memory_usage, in_block->bytes()); local_state._mem_tracker->consume(in_block->bytes()); if 
(local_state._build_side_mutable_block.rows() > diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index a03b30375dd59a..c1f4371ca1eb72 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -99,10 +99,10 @@ class IColumn : public COW { */ virtual Ptr convert_to_full_column_if_const() const { return get_ptr(); } - /** If column isn't constant, returns nullptr (or itself). - * If column is constant, transforms constant to full column (if column type allows such transform) and return it. - */ - virtual Ptr convert_to_full_column_if_overflow() { return get_ptr(); } + /** In a join, the chars size of a ColumnString may overflow uint32_t, so the column is converted to one with uint64_t offsets (ColumnString64). + * ColumnString, ColumnNullable, ColumnArray, ColumnMap and ColumnStruct override this; the default returns the column itself. + */ + virtual Ptr convert_column_if_overflow() { return get_ptr(); } /// If column isn't ColumnLowCardinality, return itself. /// If column is ColumnLowCardinality, transforms is to full column. @@ -226,6 +226,14 @@ class IColumn : public COW { /// TODO: we need `insert_range_from_const` for every column type. virtual void insert_range_from(const IColumn& src, size_t start, size_t length) = 0; + /// Appends range of elements from other column with the same type. + /// Need not throw an exception when a ColumnString overflows uint32; only + /// used in join. + virtual void insert_range_from_ignore_overflow(const IColumn& src, size_t start, + size_t length) { + insert_range_from(src, start, length); + } + /// Appends one element from other column with the same type multiple times.
virtual void insert_many_from(const IColumn& src, size_t position, size_t length) { for (size_t i = 0; i < length; ++i) { diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index 442ffd44422707..f8eb146bdb5bf5 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -514,6 +514,40 @@ void ColumnArray::insert_range_from(const IColumn& src, size_t start, size_t len } } +void ColumnArray::insert_range_from_ignore_overflow(const IColumn& src, size_t start, + size_t length) { + const ColumnArray& src_concrete = assert_cast(src); + + if (start + length > src_concrete.get_offsets().size()) { + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "Parameter out of bound in ColumnArray::insert_range_from method. " + "[start({}) + length({}) > offsets.size({})]", + std::to_string(start), std::to_string(length), + std::to_string(src_concrete.get_offsets().size())); + } + + size_t nested_offset = src_concrete.offset_at(start); + size_t nested_length = src_concrete.get_offsets()[start + length - 1] - nested_offset; + + get_data().insert_range_from_ignore_overflow(src_concrete.get_data(), nested_offset, + nested_length); + + auto& cur_offsets = get_offsets(); + const auto& src_offsets = src_concrete.get_offsets(); + + if (start == 0 && cur_offsets.empty()) { + cur_offsets.assign(src_offsets.begin(), src_offsets.begin() + length); + } else { + size_t old_size = cur_offsets.size(); + // -1 is ok, because PaddedPODArray pads zeros on the left. 
+ size_t prev_max_offset = cur_offsets.back(); + cur_offsets.resize(old_size + length); + + for (size_t i = 0; i < length; ++i) + cur_offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset; + } +} + double ColumnArray::get_ratio_of_default_rows(double sample_ratio) const { return get_ratio_of_default_rows_impl(sample_ratio); } diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h index 7d619c14efffb7..2d86bb8c546db5 100644 --- a/be/src/vec/columns/column_array.h +++ b/be/src/vec/columns/column_array.h @@ -85,6 +85,8 @@ class ColumnArray final : public COWHelper { ColumnArray(const ColumnArray&) = default; + ColumnArray() = default; + public: // offsets of array is 64bit wise using Offset64 = IColumn::Offset64; @@ -152,6 +154,8 @@ class ColumnArray final : public COWHelper { const uint8_t* __restrict null_data = nullptr) const override; void insert_range_from(const IColumn& src, size_t start, size_t length) override; + void insert_range_from_ignore_overflow(const IColumn& src, size_t start, + size_t length) override; void insert(const Field& x) override; void insert_from(const IColumn& src_, size_t n) override; void insert_default() override; @@ -217,6 +221,11 @@ class ColumnArray final : public COWHelper { callback(data); } + ColumnPtr convert_column_if_overflow() override { + data = data->convert_column_if_overflow(); + return IColumn::convert_column_if_overflow(); + } + void insert_indices_from(const IColumn& src, const uint32_t* indices_begin, const uint32_t* indices_end) override; diff --git a/be/src/vec/columns/column_map.cpp b/be/src/vec/columns/column_map.cpp index 9303d628933c3e..ad29997b97451a 100644 --- a/be/src/vec/columns/column_map.cpp +++ b/be/src/vec/columns/column_map.cpp @@ -407,6 +407,42 @@ void ColumnMap::insert_range_from(const IColumn& src, size_t start, size_t lengt } } +void ColumnMap::insert_range_from_ignore_overflow(const IColumn& src, size_t start, size_t length) { + const ColumnMap& 
src_concrete = assert_cast(src); + + if (start + length > src_concrete.size()) { + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "Parameter out of bound in ColumnMap::insert_range_from method. " + "[start({}) + length({}) > offsets.size({})]", + std::to_string(start), std::to_string(length), + std::to_string(src_concrete.size())); + } + + size_t nested_offset = src_concrete.offset_at(start); + size_t nested_length = src_concrete.offset_at(start + length) - nested_offset; + + keys_column->insert_range_from_ignore_overflow(src_concrete.get_keys(), nested_offset, + nested_length); + values_column->insert_range_from_ignore_overflow(src_concrete.get_values(), nested_offset, + nested_length); + + auto& cur_offsets = get_offsets(); + const auto& src_offsets = src_concrete.get_offsets(); + + if (start == 0 && cur_offsets.empty()) { + cur_offsets.assign(src_offsets.begin(), src_offsets.begin() + length); + } else { + size_t old_size = cur_offsets.size(); + // -1 is ok, because PaddedPODArray pads zeros on the left. 
+ size_t prev_max_offset = cur_offsets.back(); + cur_offsets.resize(old_size + length); + + for (size_t i = 0; i < length; ++i) { + cur_offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset; + } + } +} + ColumnPtr ColumnMap::filter(const Filter& filt, ssize_t result_size_hint) const { auto k_arr = ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable()) diff --git a/be/src/vec/columns/column_map.h b/be/src/vec/columns/column_map.h index e0bc7e72d7826e..73c13edd90a133 100644 --- a/be/src/vec/columns/column_map.h +++ b/be/src/vec/columns/column_map.h @@ -102,6 +102,8 @@ class ColumnMap final : public COWHelper { void insert_data(const char* pos, size_t length) override; void insert_range_from(const IColumn& src, size_t start, size_t length) override; + void insert_range_from_ignore_overflow(const IColumn& src, size_t start, + size_t length) override; void insert_from(const IColumn& src_, size_t n) override; void insert(const Field& x) override; void insert_default() override; @@ -205,6 +207,12 @@ class ColumnMap final : public COWHelper { return get_offsets()[i] - get_offsets()[i - 1]; } + ColumnPtr convert_column_if_overflow() override { + keys_column = keys_column->convert_column_if_overflow(); + values_column = values_column->convert_column_if_overflow(); + return IColumn::convert_column_if_overflow(); + } + private: friend class COWHelper; diff --git a/be/src/vec/columns/column_nullable.cpp b/be/src/vec/columns/column_nullable.cpp index dc8853e49af1ee..0a4321eea53c0a 100644 --- a/be/src/vec/columns/column_nullable.cpp +++ b/be/src/vec/columns/column_nullable.cpp @@ -283,6 +283,17 @@ void ColumnNullable::deserialize_vec(std::vector& keys, const size_t } } +void ColumnNullable::insert_range_from_ignore_overflow(const doris::vectorized::IColumn& src, + size_t start, size_t length) { + const auto& nullable_col = assert_cast(src); + _get_null_map_column().insert_range_from(*nullable_col.null_map, start, length); + 
get_nested_column().insert_range_from_ignore_overflow(*nullable_col.nested_column, start, + length); + const auto& src_null_map_data = nullable_col.get_null_map_data(); + _has_null = has_null(); + _has_null |= simd::contain_byte(src_null_map_data.data() + start, length, 1); +} + void ColumnNullable::insert_range_from(const IColumn& src, size_t start, size_t length) { const auto& nullable_col = assert_cast(src); _get_null_map_column().insert_range_from(*nullable_col.null_map, start, length); diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 0b78330949974a..7662c23e036f94 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -121,6 +121,10 @@ class ColumnNullable final : public COWHelper { void deserialize_vec(std::vector& keys, size_t num_rows) override; void insert_range_from(const IColumn& src, size_t start, size_t length) override; + + void insert_range_from_ignore_overflow(const IColumn& src, size_t start, + size_t length) override; + void insert_indices_from(const IColumn& src, const uint32_t* indices_begin, const uint32_t* indices_end) override; void insert_indices_from_not_has_null(const IColumn& src, const uint32_t* indices_begin, @@ -237,7 +241,10 @@ class ColumnNullable final : public COWHelper { append_data_by_selector_impl(res, selector, begin, end); } - // void gather(ColumnGathererStream & gatherer_stream) override; + ColumnPtr convert_column_if_overflow() override { + nested_column = nested_column->convert_column_if_overflow(); + return get_ptr(); + } void for_each_subcolumn(ColumnCallback callback) override { callback(nested_column); diff --git a/be/src/vec/columns/column_struct.cpp b/be/src/vec/columns/column_struct.cpp index d075b040e6d4d3..c0f2b3cbb7ad6d 100644 --- a/be/src/vec/columns/column_struct.cpp +++ b/be/src/vec/columns/column_struct.cpp @@ -251,6 +251,15 @@ void ColumnStruct::insert_range_from(const IColumn& src, size_t start, size_t le } } +void 
ColumnStruct::insert_range_from_ignore_overflow(const IColumn& src, size_t start, + size_t length) { + const size_t tuple_size = columns.size(); + for (size_t i = 0; i < tuple_size; ++i) { + columns[i]->insert_range_from_ignore_overflow( + *assert_cast(src).columns[i], start, length); + } +} + ColumnPtr ColumnStruct::filter(const Filter& filt, ssize_t result_size_hint) const { const size_t tuple_size = columns.size(); Columns new_columns(tuple_size); diff --git a/be/src/vec/columns/column_struct.h b/be/src/vec/columns/column_struct.h index 5157b1ad6b09e5..51cf35ff406a11 100644 --- a/be/src/vec/columns/column_struct.h +++ b/be/src/vec/columns/column_struct.h @@ -145,6 +145,8 @@ class ColumnStruct final : public COWHelper { } void insert_range_from(const IColumn& src, size_t start, size_t length) override; + void insert_range_from_ignore_overflow(const IColumn& src, size_t start, + size_t length) override; ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const override; size_t filter(const Filter& filter) override; ColumnPtr permute(const Permutation& perm, size_t limit) const override; @@ -175,10 +177,17 @@ class ColumnStruct final : public COWHelper { ColumnPtr& get_column_ptr(size_t idx) { return columns[idx]; } void clear() override { - for (auto col : columns) { + for (auto& col : columns) { col->clear(); } } + + ColumnPtr convert_column_if_overflow() override { + for (auto& col : columns) { + col = col->convert_column_if_overflow(); + } + return IColumn::convert_column_if_overflow(); + } }; } // namespace doris::vectorized diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h index d6567de0a44211..9fea5242d9baff 100644 --- a/be/src/vec/core/block.h +++ b/be/src/vec/core/block.h @@ -146,6 +146,12 @@ class Block { element.column = element.column->convert_to_full_column_if_const(); } + void replace_if_overflow() { + for (auto& ele : data) { + ele.column = std::move(*ele.column).mutate()->convert_column_if_overflow(); + } + } + 
ColumnWithTypeAndName& safe_get_by_position(size_t position); const ColumnWithTypeAndName& safe_get_by_position(size_t position) const; @@ -515,6 +521,31 @@ class MutableBlock { RETURN_IF_CATCH_EXCEPTION(return merge_impl(block);); } + template + [[nodiscard]] Status merge_ignore_overflow(T&& block) { + RETURN_IF_CATCH_EXCEPTION(return merge_impl_ignore_overflow(block);); + } + + // only use for join. call ignore_overflow to prevent from throw exception in join + template + [[nodiscard]] Status merge_impl_ignore_overflow(T&& block) { + if (_columns.size() != block.columns()) { + return Status::Error( + "Merge block not match, self:[columns: {}, types: {}], input:[columns: {}, " + "types: {}], ", + dump_names(), dump_types(), block.dump_names(), block.dump_types()); + } + for (int i = 0; i < _columns.size(); ++i) { + DCHECK(_data_types[i]->equals(*block.get_by_position(i).type)) + << " target type: " << _data_types[i]->get_name() + << " src type: " << block.get_by_position(i).type->get_name(); + _columns[i]->insert_range_from_ignore_overflow( + *block.get_by_position(i).column->convert_to_full_column_if_const().get(), 0, + block.rows()); + } + return Status::OK(); + } + template [[nodiscard]] Status merge_impl(T&& block) { // merge is not supported in dynamic block diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp index b49e74ce3f19e0..9fec942d1619f9 100644 --- a/be/src/vec/exec/join/vhash_join_node.cpp +++ b/be/src/vec/exec/join/vhash_join_node.cpp @@ -744,7 +744,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc res_col_ids)); SCOPED_TIMER(_build_side_merge_block_timer); - RETURN_IF_ERROR(_build_side_mutable_block.merge(*in_block)); + RETURN_IF_ERROR(_build_side_mutable_block.merge_ignore_overflow(*in_block)); if (_build_side_mutable_block.rows() > JOIN_BUILD_SIZE_LIMIT) { return Status::NotSupported( "Hash join do not support build table rows" @@ -942,6 +942,7 @@ Status 
HashJoinNode::_process_build_block(RuntimeState* state, Block& block) { SCOPED_TIMER(_build_table_timer); size_t rows = block.rows(); COUNTER_UPDATE(_build_rows_counter, rows); + block.replace_if_overflow(); ColumnRawPtrs raw_ptrs(_build_expr_ctxs.size()); From 719a6d1be5a331bfd2794ef12f00cb805f97b86f Mon Sep 17 00:00:00 2001 From: HappenLee Date: Wed, 17 Apr 2024 00:29:27 +0800 Subject: [PATCH 04/10] support the columnstring overflow --- be/src/vec/columns/column_string.cpp | 20 +++++++++++--------- be/src/vec/columns/column_string.h | 9 ++++----- be/src/vec/core/sort_block.h | 3 ++- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 19c74db070ee00..386216fc16e2f5 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -193,14 +193,15 @@ ColumnPtr ColumnStr::filter(const IColumn::Filter& filt, ssize_t result_size_ if constexpr (std::is_same_v) { auto res = ColumnStr::create(); - Chars &res_chars = res->chars; - IColumn::Offsets &res_offsets = res->offsets; + Chars& res_chars = res->chars; + IColumn::Offsets& res_offsets = res->offsets; filter_arrays_impl(chars, offsets, res_chars, res_offsets, filt, result_size_hint); return res; } else { - throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "should not call filter in ColumnStr"); + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "should not call filter in ColumnStr"); } } @@ -215,16 +216,17 @@ size_t ColumnStr::filter(const IColumn::Filter& filter) { if constexpr (std::is_same_v) { return filter_arrays_impl(chars, offsets, filter); } else { - throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "should not call filter in ColumnStr"); + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "should not call filter in ColumnStr"); } } template Status ColumnStr::filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) { if constexpr 
(std::is_same_v) { - auto *col = static_cast *>(col_ptr); - Chars &res_chars = col->chars; - IColumn::Offsets &res_offsets = col->offsets; + auto* col = static_cast*>(col_ptr); + Chars& res_chars = col->chars; + IColumn::Offsets& res_offsets = col->offsets; IColumn::Filter filter; filter.resize_fill(offsets.size(), 0); for (size_t i = 0; i < sel_size; i++) { @@ -560,8 +562,8 @@ ColumnPtr ColumnStr::index(const IColumn& indexes, size_t limit) const { } template -ColumnPtr ColumnStr::convert_to_full_column_if_overflow() { - if (std::is_same_v && chars.size() > std::numeric_limits::max()) { +ColumnPtr ColumnStr::convert_column_if_overflow() { + if (std::is_same_v && chars.size() > 10) { auto new_col = ColumnStr::create(); const auto length = offsets.size(); diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index c9593cf9bf0b94..e0c3caabe3a21e 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -523,9 +523,8 @@ class ColumnStr final : public COWHelper> { void append_data_by_selector(MutableColumnPtr& res, const IColumn::Selector& selector, size_t begin, size_t end) const override { - append_data_by_selector_impl(res, selector, begin, end); + this->template append_data_by_selector_impl>(res, selector, begin, end); } - // void gather(ColumnGathererStream & gatherer_stream) override; void reserve(size_t n) override; @@ -549,12 +548,12 @@ class ColumnStr final : public COWHelper> { } void replace_column_data(const IColumn& rhs, size_t row, size_t self_row = 0) override { - LOG(FATAL) << "Method replace_column_data is not supported for " << get_name(); + LOG(FATAL) << "Method replace_column_data is not supported for ColumnString"; } // should replace according to 0,1,2... ,size,0,1,2... 
void replace_column_data_default(size_t self_row = 0) override { - LOG(FATAL) << "Method replace_column_data_default is not supported for " << get_name(); + LOG(FATAL) << "Method replace_column_data_default is not supported for ColumnString"; } void compare_internal(size_t rhs_row_id, const IColumn& rhs, int nan_direction_hint, @@ -582,7 +581,7 @@ class ColumnStr final : public COWHelper> { return this->template get_ratio_of_default_rows_impl>(sample_ratio); } - ColumnPtr convert_to_full_column_if_overflow() override; + ColumnPtr convert_column_if_overflow() override; }; using ColumnString = ColumnStr; diff --git a/be/src/vec/core/sort_block.h b/be/src/vec/core/sort_block.h index 00724e9f873282..e306cf539836d4 100644 --- a/be/src/vec/core/sort_block.h +++ b/be/src/vec/core/sort_block.h @@ -333,7 +333,8 @@ class ColumnSorter { if constexpr (std::is_same_v> || std::is_same_v>) { permutation_for_column[i].inline_value = column.get_data()[row_id]; - } else if constexpr (std::is_same_v || std::is_same_v) { + } else if constexpr (std::is_same_v || + std::is_same_v) { permutation_for_column[i].inline_value = column.get_data_at(row_id); } else { static_assert(always_false_v); From 26833e0cd1968d5ad7d4efec47d689acfe55b94e Mon Sep 17 00:00:00 2001 From: HappenLee Date: Wed, 17 Apr 2024 02:53:04 +0800 Subject: [PATCH 05/10] fix copy build column core --- be/src/vec/columns/column.h | 4 +- be/src/vec/columns/column_string.cpp | 59 ++++++++++++++++------------ be/src/vec/columns/column_string.h | 2 + 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index c1f4371ca1eb72..4c1631ba339a7f 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -652,6 +652,8 @@ class IColumn : public COW { virtual bool is_column_string() const { return false; } + virtual bool is_column_string64() const { return false; } + virtual bool is_column_decimal() const { return false; } virtual bool 
is_column_dictionary() const { return false; } @@ -665,8 +667,6 @@ class IColumn : public COW { /// If the only value column can contain is NULL. virtual bool only_null() const { return false; } - virtual bool low_cardinality() const { return false; } - virtual void sort_column(const ColumnSorter* sorter, EqualFlags& flags, IColumn::Permutation& perms, EqualRange& range, bool last_column) const; diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 386216fc16e2f5..d336615044bed6 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -132,34 +132,43 @@ void ColumnStr::insert_range_from(const IColumn& src, size_t start, size_t le template void ColumnStr::insert_indices_from(const IColumn& src, const uint32_t* indices_begin, const uint32_t* indices_end) { - const auto& src_str = assert_cast&>(src); - const auto* src_offset_data = src_str.offsets.data(); + auto do_insert = [&](const auto& src_str) { + const auto* __restrict src_offset_data = src_str.get_offsets().data(); - auto old_char_size = chars.size(); - size_t total_chars_size = old_char_size; + auto old_char_size = chars.size(); + size_t total_chars_size = old_char_size; - auto dst_offsets_pos = offsets.size(); - offsets.resize(offsets.size() + indices_end - indices_begin); - auto* dst_offsets_data = offsets.data(); + auto dst_offsets_pos = offsets.size(); + offsets.resize(offsets.size() + indices_end - indices_begin); + auto* dst_offsets_data = offsets.data(); - for (const auto* x = indices_begin; x != indices_end; ++x) { - total_chars_size += src_offset_data[*x] - src_offset_data[int(*x) - 1]; - dst_offsets_data[dst_offsets_pos++] = total_chars_size; - } - check_chars_length(total_chars_size, offsets.size()); - - chars.resize(total_chars_size); - - const auto* src_data_ptr = src_str.chars.data(); - auto* dst_data_ptr = chars.data(); - - size_t dst_chars_pos = old_char_size; - for (const auto* x = indices_begin; x != indices_end; 
++x) { - const size_t size_to_append = src_offset_data[*x] - src_offset_data[int(*x) - 1]; - const size_t offset = src_offset_data[int(*x) - 1]; - memcpy_small_allow_read_write_overflow15(dst_data_ptr + dst_chars_pos, - src_data_ptr + offset, size_to_append); - dst_chars_pos += size_to_append; + for (const auto* x = indices_begin; x != indices_end; ++x) { + int64_t src_offset = *x; + total_chars_size += src_offset_data[src_offset] - src_offset_data[src_offset - 1]; + dst_offsets_data[dst_offsets_pos++] = total_chars_size; + } + check_chars_length(total_chars_size, offsets.size()); + + chars.resize(total_chars_size); + + const auto* __restrict src_data_ptr = src_str.get_chars().data(); + auto* dst_data_ptr = chars.data(); + + size_t dst_chars_pos = old_char_size; + for (const auto* x = indices_begin; x != indices_end; ++x) { + int64_t src_offset = *x; + const size_t size_to_append = + src_offset_data[src_offset] - src_offset_data[src_offset - 1]; + const size_t offset = src_offset_data[src_offset - 1]; + memcpy_small_allow_read_write_overflow15(dst_data_ptr + dst_chars_pos, + src_data_ptr + offset, size_to_append); + dst_chars_pos += size_to_append; + } + }; + if (src.is_column_string64()) { + do_insert(assert_cast&>(src)); + } else { + do_insert(assert_cast&>(src)); } } diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index e0c3caabe3a21e..a06ff062550686 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -164,6 +164,8 @@ class ColumnStr final : public COWHelper> { offsets.push_back(new_size); } + bool is_column_string64() const override { return sizeof(T) == sizeof(uint64_t); } + void insert_from(const IColumn& src_, size_t n) override { const ColumnStr& src = assert_cast&>(src_); const size_t size_to_append = From eadfde7b23619fc03ea6aefcb14d5f41ed942087 Mon Sep 17 00:00:00 2001 From: HappenLee Date: Wed, 17 Apr 2024 10:53:21 +0800 Subject: [PATCH 06/10] fix rf failed --- 
be/src/exprs/bloom_filter_func.h | 37 ++++++++++------ be/src/exprs/hybrid_set.h | 74 +++++++++++++++++++++----------- be/src/exprs/minmax_predicate.h | 41 ++++++++++-------- 3 files changed, 99 insertions(+), 53 deletions(-) diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index 10d30212ff8f0a..987c2b0c05d9b7 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -379,25 +379,38 @@ struct CommonFindOp { struct StringFindOp : CommonFindOp { static void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, size_t start) { + auto _insert_batch_col_str = [&](const auto& col, const uint8_t* __restrict nullmap, + size_t start, size_t size) { + for (size_t i = start; i < size; i++) { + if (nullmap == nullptr || !nullmap[i]) { + bloom_filter.add_element(col.get_data_at(i)); + } else { + bloom_filter.set_contain_null(); + } + } + }; + if (column->is_nullable()) { const auto* nullable = assert_cast(column.get()); - const auto& col = - assert_cast(nullable->get_nested_column()); const auto& nullmap = assert_cast(nullable->get_null_map_column()) .get_data(); - - for (size_t i = start; i < col.size(); i++) { - if (!nullmap[i]) { - bloom_filter.add_element(col.get_data_at(i)); - } else { - bloom_filter.set_contain_null(); - } + if (nullable->get_nested_column().is_column_string64()) { + _insert_batch_col_str(assert_cast( + nullable->get_nested_column()), + nullmap.data(), start, nullmap.size()); + } else { + _insert_batch_col_str( + assert_cast(nullable->get_nested_column()), + nullmap.data(), start, nullmap.size()); } } else { - const auto& col = assert_cast(column.get()); - for (size_t i = start; i < col->size(); i++) { - bloom_filter.add_element(col->get_data_at(i)); + if (column->is_column_string64()) { + _insert_batch_col_str(assert_cast(*column), + nullptr, start, column->size()); + } else { + _insert_batch_col_str(assert_cast(*column), + nullptr, start, column->size()); } } } 
diff --git a/be/src/exprs/hybrid_set.h b/be/src/exprs/hybrid_set.h index ba5fabe509be62..03cde80c24bf13 100644 --- a/be/src/exprs/hybrid_set.h +++ b/be/src/exprs/hybrid_set.h @@ -418,26 +418,39 @@ class StringSet : public HybridSetBase { } } + void _insert_fixed_len_string(const auto& col, const uint8_t* __restrict nullmap, size_t start, + size_t end) { + for (size_t i = start; i < end; i++) { + if (nullmap == nullptr || !nullmap[i]) { + _set.insert(col.get_data_at(i).to_string()); + } else { + _contains_null = true; + } + } + } + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { if (column->is_nullable()) { const auto* nullable = assert_cast(column.get()); - const auto& col = - assert_cast(nullable->get_nested_column()); const auto& nullmap = assert_cast(nullable->get_null_map_column()) .get_data(); - - for (size_t i = start; i < nullable->size(); i++) { - if (!nullmap[i]) { - _set.insert(col.get_data_at(i).to_string()); - } else { - _contains_null = true; - } + if (nullable->get_nested_column().is_column_string64()) { + _insert_fixed_len_string(assert_cast( + nullable->get_nested_column()), + nullmap.data(), start, nullmap.size()); + } else { + _insert_fixed_len_string( + assert_cast(nullable->get_nested_column()), + nullmap.data(), start, nullmap.size()); } } else { - const auto& col = assert_cast(column.get()); - for (size_t i = start; i < col->size(); i++) { - _set.insert(col->get_data_at(i).to_string()); + if (column->is_column_string64()) { + _insert_fixed_len_string(assert_cast(*column), + nullptr, start, column->size()); + } else { + _insert_fixed_len_string(assert_cast(*column), + nullptr, start, column->size()); } } } @@ -567,26 +580,39 @@ class StringValueSet : public HybridSetBase { } } + void _insert_fixed_len_string(const auto& col, const uint8_t* __restrict nullmap, size_t start, + size_t end) { + for (size_t i = start; i < end; i++) { + if (nullmap == nullptr || !nullmap[i]) { + _set.insert(col.get_data_at(i)); + }
else { + _contains_null = true; + } + } + } + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { if (column->is_nullable()) { const auto* nullable = assert_cast(column.get()); - const auto& col = - assert_cast(nullable->get_nested_column()); const auto& nullmap = assert_cast(nullable->get_null_map_column()) .get_data(); - - for (size_t i = start; i < nullable->size(); i++) { - if (!nullmap[i]) { - _set.insert(col.get_data_at(i)); - } else { - _contains_null = true; - } + if (nullable->get_nested_column().is_column_string64()) { + _insert_fixed_len_string(assert_cast( + nullable->get_nested_column()), + nullmap.data(), start, nullmap.size()); + } else { + _insert_fixed_len_string( + assert_cast(nullable->get_nested_column()), + nullmap.data(), start, nullmap.size()); } } else { - const auto& col = assert_cast(column.get()); - for (size_t i = start; i < col->size(); i++) { - _set.insert(col->get_data_at(i)); + if (column->is_column_string64()) { + _insert_fixed_len_string(assert_cast(*column), + nullptr, start, column->size()); + } else { + _insert_fixed_len_string(assert_cast(*column), + nullptr, start, column->size()); } } } diff --git a/be/src/exprs/minmax_predicate.h b/be/src/exprs/minmax_predicate.h index cf746dc17cc504..b4291e2edb7e6b 100644 --- a/be/src/exprs/minmax_predicate.h +++ b/be/src/exprs/minmax_predicate.h @@ -70,11 +70,10 @@ class MinMaxNumFunc : public MinMaxFuncBase { } } - void update_batch(const vectorized::ColumnPtr& column, size_t start) { - const auto size = column->size(); - if constexpr (std::is_same_v) { - const auto& column_string = assert_cast(*column); - for (size_t i = start; i < size; i++) { + void _update_batch_string(const auto& column_string, const uint8_t* __restrict nullmap, + size_t start, size_t size) { + for (size_t i = start; i < size; i++) { + if (nullmap == nullptr || !nullmap[i]) { if constexpr (NeedMin) { _min = std::min(_min, column_string.get_data_at(i)); } @@ -82,7 +81,20 @@ class 
MinMaxNumFunc : public MinMaxFuncBase { _max = std::max(_max, column_string.get_data_at(i)); } } - store_string_ref(); + } + store_string_ref(); + } + + void update_batch(const vectorized::ColumnPtr& column, size_t start) { + const auto size = column->size(); + if constexpr (std::is_same_v) { + if (column->is_column_string64()) { + _update_batch_string(assert_cast(*column), + nullptr, start, size); + } else { + _update_batch_string(assert_cast(*column), nullptr, + start, size); + } } else { const T* data = (T*)column->get_raw_data().data; for (size_t i = start; i < size; i++) { @@ -100,18 +112,13 @@ class MinMaxNumFunc : public MinMaxFuncBase { size_t start) { const auto size = column->size(); if constexpr (std::is_same_v) { - const auto& column_string = assert_cast(*column); - for (size_t i = start; i < size; i++) { - if (!nullmap[i]) { - if constexpr (NeedMin) { - _min = std::min(_min, column_string.get_data_at(i)); - } - if constexpr (NeedMax) { - _max = std::max(_max, column_string.get_data_at(i)); - } - } + if (column->is_column_string64()) { + _update_batch_string(assert_cast(*column), + nullmap.data(), start, size); + } else { + _update_batch_string(assert_cast(*column), + nullmap.data(), start, size); } - store_string_ref(); } else { const T* data = (T*)column->get_raw_data().data; for (size_t i = start; i < size; i++) { From 216b0af8dd151ee910f4dd288db416fbbe762d5b Mon Sep 17 00:00:00 2001 From: HappenLee Date: Wed, 17 Apr 2024 13:31:58 +0800 Subject: [PATCH 07/10] change code --- be/src/vec/columns/column.h | 2 +- be/src/vec/columns/column_string.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index 4c1631ba339a7f..8fdc01701c8e90 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -99,7 +99,7 @@ class IColumn : public COW { */ virtual Ptr convert_to_full_column_if_const() const { return get_ptr(); } - /** If in join. 
the StringColumn size may overflow uint32_t, we need convert to uint64_t to ColumnLargeStringForJoin + /** If in join. the StringColumn size may overflow uint32_t, we need convert to uint64_t to ColumnString64 * The Column: ColumnString, ColumnNullable, ColumnArray, ColumnStruct need impl the code */ virtual Ptr convert_column_if_overflow() { return get_ptr(); } diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index d336615044bed6..bf7a7f91ba5d86 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -572,7 +572,8 @@ ColumnPtr ColumnStr::index(const IColumn& indexes, size_t limit) const { template ColumnPtr ColumnStr::convert_column_if_overflow() { - if (std::is_same_v && chars.size() > 10) { + // TODO: Try to fuzzy the overflow size to test more case in CI + if (std::is_same_v && chars.size() > std::numeric_limits::max()) { auto new_col = ColumnStr::create(); const auto length = offsets.size(); From 3c61a4b1e1da8c04a98c0d984a776f693d89a2cf Mon Sep 17 00:00:00 2001 From: HappenLee Date: Wed, 17 Apr 2024 13:50:47 +0800 Subject: [PATCH 08/10] change by cr --- be/src/vec/functions/in.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/be/src/vec/functions/in.h b/be/src/vec/functions/in.h index 860df856be7ae0..ef6ffc520f8a78 100644 --- a/be/src/vec/functions/in.h +++ b/be/src/vec/functions/in.h @@ -50,8 +50,6 @@ #include "vec/data_types/data_type_number.h" #include "vec/functions/function.h" -namespace doris {} // namespace doris - namespace doris::vectorized { struct InState { From 14a1b54cfa5dd0f9d1161f055a574791d4b359cf Mon Sep 17 00:00:00 2001 From: HappenLee Date: Wed, 17 Apr 2024 14:04:43 +0800 Subject: [PATCH 09/10] change comment --- be/src/vec/columns/column_string.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index a06ff062550686..8ba20f819ee952 100644 --- 
a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. // This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/ColumnStr.h +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/ColumnString.h // and modified by Doris #pragma once From 34f195c557c5e64b662cc597e112b50b18fec600 Mon Sep 17 00:00:00 2001 From: HappenLee Date: Wed, 17 Apr 2024 18:53:26 +0800 Subject: [PATCH 10/10] impl the column string insert_range_from_ignore_overflow --- be/src/vec/columns/column_string.cpp | 37 +++++++++++++++++++++++++++- be/src/vec/columns/column_string.h | 3 +++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index bf7a7f91ba5d86..634c51c47c376d 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -93,13 +93,48 @@ MutableColumnPtr ColumnStr::get_shrinked_column() { return shrinked_column; } +template +void ColumnStr::insert_range_from_ignore_overflow(const doris::vectorized::IColumn& src, + size_t start, size_t length) { + if (length == 0) { + return; + } + + const auto& src_concrete = assert_cast&>(src); + if (start + length > src_concrete.offsets.size()) { + throw doris::Exception( + doris::ErrorCode::INTERNAL_ERROR, + "Parameter out of bound in IColumnStr::insert_range_from method."); + } + + size_t nested_offset = src_concrete.offset_at(start); + size_t nested_length = src_concrete.offsets[start + length - 1] - nested_offset; + + size_t old_chars_size = chars.size(); + chars.resize(old_chars_size + nested_length); + memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length); + + if (start == 0 && offsets.empty()) { + offsets.assign(src_concrete.offsets.begin(), src_concrete.offsets.begin() + length); + } else { + size_t old_size = offsets.size(); + 
size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray + offsets.resize(old_size + length); + + for (size_t i = 0; i < length; ++i) { + offsets[old_size + i] = + src_concrete.offsets[start + i] - nested_offset + prev_max_offset; + } + } +} + template void ColumnStr::insert_range_from(const IColumn& src, size_t start, size_t length) { if (length == 0) { return; } - const ColumnStr& src_concrete = assert_cast&>(src); + const auto& src_concrete = assert_cast&>(src); if (start + length > src_concrete.offsets.size()) { throw doris::Exception( diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 8ba20f819ee952..19e62c1a59ae55 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -482,6 +482,9 @@ class ColumnStr final : public COWHelper> { void insert_range_from(const IColumn& src, size_t start, size_t length) override; + void insert_range_from_ignore_overflow(const IColumn& src, size_t start, + size_t length) override; + void insert_indices_from(const IColumn& src, const uint32_t* indices_begin, const uint32_t* indices_end) override;