From 465e91d6cc1f4549198dc2882740ba1535125207 Mon Sep 17 00:00:00 2001 From: HappenLee Date: Wed, 23 Jun 2021 19:41:51 +0800 Subject: [PATCH 1/2] SIMD instruction speed up the storage layer --- .../rowset/segment_v2/binary_dict_page.cpp | 41 +++++++++++++------ .../rowset/segment_v2/binary_plain_page.h | 34 +++++++++++---- be/src/runtime/mem_pool.h | 2 +- be/src/util/bit_util.h | 5 +++ 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index c5aec41aeefcb4..2c14511b50ff34 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -20,6 +20,7 @@ #include "common/logging.h" #include "gutil/strings/substitute.h" // for Substitute #include "olap/rowset/segment_v2/bitshuffle_page.h" +#include "runtime/mem_pool.h" #include "util/slice.h" // for Slice namespace doris { @@ -238,8 +239,8 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) { // dictionary encoding DCHECK(_parsed); DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr"; + if (PREDICT_FALSE(*n == 0)) { - *n = 0; return Status::OK(); } Slice* out = reinterpret_cast(dst->data()); @@ -248,21 +249,37 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) { ColumnBlock column_block(_batch.get(), dst->column_block()->pool()); ColumnBlockView tmp_block_view(&column_block); RETURN_IF_ERROR(_data_page_decoder->next_batch(n, &tmp_block_view)); - for (int i = 0; i < *n; ++i) { + const auto len = *n; + + std::vector mem_len(len); + for (int i = 0; i < len; ++i) { int32_t codeword = *reinterpret_cast(column_block.cell_ptr(i)); // get the string from the dict decoder - Slice element = _dict_decoder->string_at_index(codeword); - if (element.size > 0) { - char* destination = (char*)dst->column_block()->pool()->allocate(element.size); - if (destination == nullptr) { - return 
Status::MemoryAllocFailed( - strings::Substitute("memory allocate failed, size:$0", element.size)); - } - element.relocate(destination); - } - *out = element; + *out = _dict_decoder->string_at_index(codeword); + mem_len[i] = out->size; + out++; + } + + // use SIMD instruction to speed up call function `RoundUpToPowerOfTwo` + auto mem_size = 0; + for (int i = 0; i < len; ++i) { + mem_len[i] = BitUtil::RoundUpToPowerOf2Int32(mem_len[i], MemPool::DEFAULT_ALIGNMENT); + mem_size += mem_len[i]; + } + + // allocate a batch of memory and do memcpy + out = reinterpret_cast(dst->data()); + char* destination = (char*)dst->column_block()->pool()->allocate(mem_size); + if (destination == nullptr) { + return Status::MemoryAllocFailed( + strings::Substitute("memory allocate failed, size:$0", mem_size)); + } + for (int i = 0; i < len; ++i) { + out->relocate(destination); + destination += mem_len[i]; ++out; } + return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index bde3ae0dc9a826..88c9169c7bf866 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -29,6 +29,7 @@ #pragma once #include "common/logging.h" +#include "gutil/strings/substitute.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/options.h" #include "olap/rowset/segment_v2/page_builder.h" @@ -193,18 +194,33 @@ class BinaryPlainPageDecoder : public PageDecoder { *n = 0; return Status::OK(); } - size_t max_fetch = std::min(*n, static_cast(_num_elems - _cur_idx)); + const size_t max_fetch = std::min(*n, static_cast(_num_elems - _cur_idx)); Slice* out = reinterpret_cast(dst->data()); - + std::vector mem_len(max_fetch); for (size_t i = 0; i < max_fetch; i++, out++, _cur_idx++) { - Slice elem(string_at_index(_cur_idx)); - out->size = elem.size; - if (elem.size != 0) { - out->data = - reinterpret_cast(dst->pool()->allocate(elem.size * sizeof(uint8_t))); - 
memcpy(out->data, elem.data, elem.size); - } + *out = string_at_index(_cur_idx); + mem_len[i] = out->size; + } + + // use SIMD instruction to speed up call function `RoundUpToPowerOfTwo` + auto mem_size = 0; + for (int i = 0; i < max_fetch; ++i) { + mem_len[i] = BitUtil::RoundUpToPowerOf2Int32(mem_len[i], MemPool::DEFAULT_ALIGNMENT); + mem_size += mem_len[i]; + } + + // allocate a batch of memory and do memcpy + out = reinterpret_cast(dst->data()); + char* destination = (char*)dst->column_block()->pool()->allocate(mem_size); + if (destination == nullptr) { + return Status::MemoryAllocFailed( + strings::Substitute("memory allocate failed, size:$0", mem_size)); + } + for (int i = 0; i < max_fetch; ++i) { + out->relocate(destination); + destination += mem_len[i]; + ++out; } *n = max_fetch; diff --git a/be/src/runtime/mem_pool.h b/be/src/runtime/mem_pool.h index 0290361ba7e8f3..3a3750eda79ea7 100644 --- a/be/src/runtime/mem_pool.h +++ b/be/src/runtime/mem_pool.h @@ -161,7 +161,7 @@ class MemPool { MemTracker* mem_tracker() { return mem_tracker_; } - static const int DEFAULT_ALIGNMENT = 8; + static constexpr int DEFAULT_ALIGNMENT = 8; private: friend class MemPoolTest; diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h index d49e48325e4668..656e70b28177e3 100644 --- a/be/src/util/bit_util.h +++ b/be/src/util/bit_util.h @@ -300,6 +300,11 @@ class BitUtil { return (value + (factor - 1)) & ~(factor - 1); } + // speed up function compute for SIMD + static inline size_t RoundUpToPowerOf2Int32(size_t value, size_t factor) { + return (value + (factor - 1)) & ~(factor - 1); + } + // Returns the ceil of value/divisor static inline int Ceil(int value, int divisor) { return value / divisor + (value % divisor != 0); From 8c4ceb6e7aa050eaa795547662a60d29f942658d Mon Sep 17 00:00:00 2001 From: HappenLee Date: Thu, 24 Jun 2021 15:06:16 +0800 Subject: [PATCH 2/2] 1. add DCHECK in power of 2 int32 2. 
change vector to array to reduce the cost --- be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 2 +- be/src/olap/rowset/segment_v2/binary_plain_page.h | 2 +- be/src/util/bit_util.h | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index 2c14511b50ff34..a65cdf2130c800 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -251,7 +251,7 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) { RETURN_IF_ERROR(_data_page_decoder->next_batch(n, &tmp_block_view)); const auto len = *n; - std::vector mem_len(len); + size_t mem_len[len]; for (int i = 0; i < len; ++i) { int32_t codeword = *reinterpret_cast(column_block.cell_ptr(i)); // get the string from the dict decoder diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index 88c9169c7bf866..97e7fa8bbcf622 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -197,7 +197,7 @@ class BinaryPlainPageDecoder : public PageDecoder { const size_t max_fetch = std::min(*n, static_cast(_num_elems - _cur_idx)); Slice* out = reinterpret_cast(dst->data()); - std::vector mem_len(max_fetch); + size_t mem_len[max_fetch]; for (size_t i = 0; i < max_fetch; i++, out++, _cur_idx++) { *out = string_at_index(_cur_idx); mem_len[i] = out->size; diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h index 656e70b28177e3..a4bf2ef6eb25bd 100644 --- a/be/src/util/bit_util.h +++ b/be/src/util/bit_util.h @@ -302,6 +302,7 @@ class BitUtil { // speed up function compute for SIMD static inline size_t RoundUpToPowerOf2Int32(size_t value, size_t factor) { + DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); return (value + (factor - 1)) & ~(factor - 1); }