diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index c5aec41aeefcb4..a65cdf2130c800 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -20,6 +20,7 @@ #include "common/logging.h" #include "gutil/strings/substitute.h" // for Substitute #include "olap/rowset/segment_v2/bitshuffle_page.h" +#include "runtime/mem_pool.h" #include "util/slice.h" // for Slice namespace doris { @@ -238,8 +239,8 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) { // dictionary encoding DCHECK(_parsed); DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr"; + if (PREDICT_FALSE(*n == 0)) { - *n = 0; return Status::OK(); } Slice* out = reinterpret_cast(dst->data()); @@ -248,21 +249,37 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) { ColumnBlock column_block(_batch.get(), dst->column_block()->pool()); ColumnBlockView tmp_block_view(&column_block); RETURN_IF_ERROR(_data_page_decoder->next_batch(n, &tmp_block_view)); - for (int i = 0; i < *n; ++i) { + const auto len = *n; + + size_t mem_len[len]; + for (int i = 0; i < len; ++i) { int32_t codeword = *reinterpret_cast(column_block.cell_ptr(i)); // get the string from the dict decoder - Slice element = _dict_decoder->string_at_index(codeword); - if (element.size > 0) { - char* destination = (char*)dst->column_block()->pool()->allocate(element.size); - if (destination == nullptr) { - return Status::MemoryAllocFailed( - strings::Substitute("memory allocate failed, size:$0", element.size)); - } - element.relocate(destination); - } - *out = element; + *out = _dict_decoder->string_at_index(codeword); + mem_len[i] = out->size; + out++; + } + + // use SIMD instruction to speed up call function `RoundUpToPowerOfTwo` + auto mem_size = 0; + for (int i = 0; i < len; ++i) { + mem_len[i] = BitUtil::RoundUpToPowerOf2Int32(mem_len[i], MemPool::DEFAULT_ALIGNMENT); + mem_size += mem_len[i]; + } + + // allocate a batch of memory and do memcpy + out = reinterpret_cast(dst->data()); + char* destination = (char*)dst->column_block()->pool()->allocate(mem_size); + if (destination == nullptr) { + return Status::MemoryAllocFailed( + strings::Substitute("memory allocate failed, size:$0", mem_size)); + } + for (int i = 0; i < len; ++i) { + out->relocate(destination); + destination += mem_len[i]; ++out; } + return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index bde3ae0dc9a826..97e7fa8bbcf622 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -29,6 +29,7 @@ #pragma once #include "common/logging.h" +#include "gutil/strings/substitute.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/options.h" #include "olap/rowset/segment_v2/page_builder.h" @@ -193,18 +194,33 @@ class BinaryPlainPageDecoder : public PageDecoder { *n = 0; return Status::OK(); } - size_t max_fetch = std::min(*n, static_cast(_num_elems - _cur_idx)); + const size_t max_fetch = std::min(*n, static_cast(_num_elems - _cur_idx)); Slice* out = reinterpret_cast(dst->data()); - + size_t mem_len[max_fetch]; for (size_t i = 0; i < max_fetch; i++, out++, _cur_idx++) { - Slice elem(string_at_index(_cur_idx)); - out->size = elem.size; - if (elem.size != 0) { - out->data = - reinterpret_cast(dst->pool()->allocate(elem.size * sizeof(uint8_t))); - memcpy(out->data, elem.data, elem.size); - } + *out = string_at_index(_cur_idx); + mem_len[i] = out->size; + } + + // use SIMD instruction to speed up call function `RoundUpToPowerOfTwo` + auto mem_size = 0; + for (int i = 0; i < max_fetch; ++i) { + mem_len[i] = BitUtil::RoundUpToPowerOf2Int32(mem_len[i], MemPool::DEFAULT_ALIGNMENT); + mem_size += mem_len[i]; + } + + // allocate a batch of memory and do memcpy + out = reinterpret_cast(dst->data()); + char* destination = (char*)dst->column_block()->pool()->allocate(mem_size); + if (destination == nullptr) { + return Status::MemoryAllocFailed( + strings::Substitute("memory allocate failed, size:$0", mem_size)); + } + for (int i = 0; i < max_fetch; ++i) { + out->relocate(destination); + destination += mem_len[i]; + ++out; } *n = max_fetch; diff --git a/be/src/runtime/mem_pool.h b/be/src/runtime/mem_pool.h index 0290361ba7e8f3..3a3750eda79ea7 100644 --- a/be/src/runtime/mem_pool.h +++ b/be/src/runtime/mem_pool.h @@ -161,7 +161,7 @@ class MemPool { MemTracker* mem_tracker() { return mem_tracker_; } - static const int DEFAULT_ALIGNMENT = 8; + static constexpr int DEFAULT_ALIGNMENT = 8; private: friend class MemPoolTest; diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h index d49e48325e4668..a4bf2ef6eb25bd 100644 --- a/be/src/util/bit_util.h +++ b/be/src/util/bit_util.h @@ -300,6 +300,12 @@ class BitUtil { return (value + (factor - 1)) & ~(factor - 1); } + // speed up function compute for SIMD + static inline size_t RoundUpToPowerOf2Int32(size_t value, size_t factor) { + DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); + return (value + (factor - 1)) & ~(factor - 1); + } + // Returns the ceil of value/divisor static inline int Ceil(int value, int divisor) { return value / divisor + (value % divisor != 0);