diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 74a79722179296..c2ea031f5e3980 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -192,11 +192,11 @@ set_target_properties(librdkafka PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/ add_library(libs2 STATIC IMPORTED) set_target_properties(libs2 PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libs2.a) -add_library(libbitshuffle STATIC IMPORTED) -set_target_properties(libbitshuffle PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libbitshuffle.a) +add_library(bitshuffle STATIC IMPORTED) +set_target_properties(bitshuffle PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libbitshuffle.a) -add_library(libroaring STATIC IMPORTED) -set_target_properties(libroaring PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libroaring.a) +add_library(roaring STATIC IMPORTED) +set_target_properties(roaring PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libroaring.a) find_program(THRIFT_COMPILER thrift ${CMAKE_SOURCE_DIR}/bin) @@ -496,6 +496,8 @@ set(DORIS_DEPENDENCIES openssl crypto leveldb + bitshuffle + roaring ${WL_END_GROUP} ) diff --git a/be/src/olap/CMakeLists.txt b/be/src/olap/CMakeLists.txt index 51c76663a0159b..da809926116306 100644 --- a/be/src/olap/CMakeLists.txt +++ b/be/src/olap/CMakeLists.txt @@ -20,7 +20,7 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/olap") # where to put generated binaries set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/olap") - + add_library(Olap STATIC aggregate_func.cpp base_compaction.cpp @@ -85,4 +85,5 @@ add_library(Olap STATIC wrapper_field.cpp rowset/segment_v2/ordinal_page_index.cpp rowset/segment_v2/encoding_info.cpp + rowset/segment_v2/bitshuffle_wrapper.cpp ) diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_page.h b/be/src/olap/rowset/segment_v2/bitshuffle_page.h new file mode 100644 index 00000000000000..89cb486c90eb80 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/bitshuffle_page.h @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "util/coding.h" +#include "util/faststring.h" +#include "gutil/port.h" +#include "olap/olap_common.h" +#include "olap/types.h" +#include "olap/rowset/segment_v2/page_builder.h" +#include "olap/rowset/segment_v2/page_decoder.h" +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/common.h" +#include "olap/rowset/segment_v2/bitshuffle_wrapper.h" + +namespace doris { +namespace segment_v2 { + +void warn_with_bitshuffle_error(int64_t val) { + switch (val) { + case -1: + LOG(WARNING) << "Failed to allocate memory"; + break; + case -11: + LOG(WARNING) << "Missing SSE"; + break; + case -12: + LOG(WARNING) << "Missing AVX"; + break; + case -80: + LOG(WARNING) << "Input size not a multiple of 8"; + break; + case -81: + LOG(WARNING) << "block_size not multiple of 8"; + break; + case -91: + LOG(WARNING) << "Decompression error, wrong number of bytes processed"; + break; + default: + LOG(WARNING) << "Error internal to compression routine"; + } +} + +// BitshufflePageBuilder bitshuffles and compresses the bits of fixed +// size type blocks with lz4. +// +// The page format is as follows: +// +// 1. Header: (20 bytes total) +// +// [32-bit] +// The ordinal offset of the first element in the page. +// +// [32-bit] +// The number of elements encoded in the page. +// +// [32-bit] +// The post-compression size of the page, including this header. +// +// [32-bit] +// Padding is needed to meet the requirements of the bitshuffle +// library such that the input/output is a multiple of 8. Some +// ignored elements are appended to the end of the page if necessary +// to meet this requirement. +// +// This header field is the post-padding element count. +// +// [32-bit] +// The size of the elements, in bytes, as actually encoded. In the +// case that all of the data in a page can fit into a smaller +// integer type, then we may choose to encode that smaller type +// to save CPU costs. +// +// This is currently only implemented in the UINT32 page type. +// +// NOTE: all on-disk ints are encoded little-endian +// +// 2. Element data +// +// The header is followed by the bitshuffle-compressed element data. +// +template +class BitshufflePageBuilder : public PageBuilder { +public: + BitshufflePageBuilder(PageBuilderOptions options) : + _options(std::move(options)), + _count(0), + _remain_element_capacity(0), + _finished(false) { + reset(); + } + + bool is_page_full() override { + return _remain_element_capacity == 0; + } + + Status add(const uint8_t* vals, size_t* count) override { + DCHECK(!_finished); + int to_add = std::min(_remain_element_capacity, *count); + _data.append(vals, to_add * SIZE_OF_TYPE); + _count += to_add; + _remain_element_capacity -= to_add; + // return added number through count + *count = to_add; + return Status::OK(); + } + + Status get_dictionary_page(Slice* dictionary_page) override { + return Status::NotSupported("get_dictionary_page not supported in bitshuffle page builder"); + } + + Slice finish(rowid_t page_first_rowid) override { + return _finish(page_first_rowid, SIZE_OF_TYPE); + } + + void reset() override { + auto block_size = _options.data_page_size; + _count = 0; + _data.clear(); + _data.reserve(block_size); + DCHECK_EQ(reinterpret_cast(_data.data()) & (alignof(CppType) - 1), 0) + << "buffer must be naturally-aligned"; + _buffer.clear(); + _buffer.resize(HEADER_SIZE); + _finished = false; + _remain_element_capacity = block_size / SIZE_OF_TYPE; + } + + size_t count() const { + return _count; + } + + // this api will release the memory ownership of encoded data + // Note: + // release() should be called after finish + // reset() should be called after this function before reuse the builder + void release() override { + uint8_t* ret = _buffer.release(); + (void)ret; + } + +private: + Slice _finish(rowid_t page_first_rowid, int final_size_of_type) { + _data.resize(HEADER_SIZE + final_size_of_type * _count); + + // Do padding so that the input num of element is multiple of 8. + int num_elems_after_padding = ALIGN_UP(_count, 8); + int padding_elems = num_elems_after_padding - _count; + int padding_bytes = padding_elems * final_size_of_type; + for (int i = 0; i < padding_bytes; i++) { + _data.push_back(0); + } + + _buffer.resize(HEADER_SIZE + + bitshuffle::compress_lz4_bound(num_elems_after_padding, final_size_of_type, 0)); + + encode_fixed32_le(&_buffer[0], page_first_rowid); + encode_fixed32_le(&_buffer[4], _count); + int64_t bytes = bitshuffle::compress_lz4(_data.data(), &_buffer[HEADER_SIZE], + num_elems_after_padding, final_size_of_type, 0); + if (PREDICT_FALSE(bytes < 0)) { + // This means the bitshuffle function fails. + // Ideally, this should not happen. + warn_with_bitshuffle_error(bytes); + // It does not matter what will be returned here, + // since we have logged fatal in warn_with_bitshuffle_error(). + return Slice(); + } + encode_fixed32_le(&_buffer[8], HEADER_SIZE + bytes); + encode_fixed32_le(&_buffer[12], num_elems_after_padding); + encode_fixed32_le(&_buffer[16], final_size_of_type); + _finished = true; + return Slice(_buffer.data(), HEADER_SIZE + bytes); + } + + typedef typename TypeTraits::CppType CppType; + + CppType cell(int idx) const { + DCHECK_GE(idx, 0); + CppType ret; + memcpy(&ret, &_data[idx * SIZE_OF_TYPE], sizeof(CppType)); + return ret; + } + + // Length of a header. + static const size_t HEADER_SIZE = sizeof(uint32_t) * 5; + enum { + SIZE_OF_TYPE = TypeTraits::size + }; + PageBuilderOptions _options; + uint32_t _count; + int _remain_element_capacity; + bool _finished; + faststring _data; + faststring _buffer; +}; + +template +class BitShufflePageDecoder : public PageDecoder { +public: + BitShufflePageDecoder(Slice data) : _data(data), + _parsed(false), + _page_first_ordinal(0), + _num_elements(0), + _compressed_size(0), + _num_element_after_padding(0), + _size_of_element(0), + _cur_index(0) { } + + Status init() override { + CHECK(!_parsed); + if (_data.size < HEADER_SIZE) { + std::stringstream ss; + ss << "file corrupton: invalid data size:" << _data.size << ", header size:" << HEADER_SIZE; + return Status::InternalError(ss.str()); + } + _page_first_ordinal = decode_fixed32_le((const uint8_t*)&_data[0]); + _num_elements = decode_fixed32_le((const uint8_t*)&_data[4]); + _compressed_size = decode_fixed32_le((const uint8_t*)&_data[8]); + if (_compressed_size != _data.size) { + std::stringstream ss; + ss << "Size information unmatched, _compressed_size:" << _compressed_size + << ", data size:" << _data.size; + return Status::InternalError(ss.str()); + } + _num_element_after_padding = decode_fixed32_le((const uint8_t*)&_data[12]); + if (_num_element_after_padding != ALIGN_UP(_num_elements, 8)) { + std::stringstream ss; + ss << "num of element information corrupted," + << " _num_element_after_padding:" << _num_element_after_padding + << ", _num_elements:" << _num_elements; + return Status::InternalError(ss.str()); + } + _size_of_element = decode_fixed32_le((const uint8_t*)&_data[16]); + switch (_size_of_element) { + case 1: + case 2: + case 4: + case 8: + case 16: + break; + default: + std::stringstream ss; + ss << "invalid size_of_elem:" << _size_of_element; + return Status::InternalError(ss.str()); + } + + // Currently, only the UINT32 block encoder supports expanding size: + if (UNLIKELY(Type != OLAP_FIELD_TYPE_UNSIGNED_INT && _size_of_element != SIZE_OF_TYPE)) { + std::stringstream ss; + ss << "invalid size info. size of element:" << _size_of_element + << ", SIZE_OF_TYPE:" << SIZE_OF_TYPE + << ", type:" << Type; + return Status::InternalError(ss.str()); + } + if (UNLIKELY(_size_of_element > SIZE_OF_TYPE)) { + std::stringstream ss; + ss << "invalid size info. size of element:" << _size_of_element + << ", SIZE_OF_TYPE:" << SIZE_OF_TYPE; + return Status::InternalError(ss.str()); + } + + RETURN_IF_ERROR(_decode()); + _parsed = true; + return Status::OK(); + } + + Status seek_to_position_in_page(size_t pos) override { + DCHECK(_parsed) << "Must call init()"; + if (PREDICT_FALSE(_num_elements == 0)) { + DCHECK_EQ(0, pos); + return Status::InvalidArgument("invalid pos"); + } + + DCHECK_LE(pos, _num_elements); + _cur_index = pos; + return Status::OK(); + } + + Status next_batch(size_t* n, ColumnVectorView* dst) override { + DCHECK(_parsed); + if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) { + *n = 0; + return Status::OK(); + } + + size_t max_fetch = std::min(*n, static_cast(_num_elements - _cur_index)); + _copy_next_values(max_fetch, dst->column_vector()->col_data()); + *n = max_fetch; + _cur_index += max_fetch; + + return Status::OK(); + } + + size_t count() const override { + return _num_elements; + } + + size_t current_index() const override { + return _cur_index; + } + + rowid_t get_first_rowid() const override { + return _page_first_ordinal; + } + +private: + void _copy_next_values(size_t n, void* data) { + memcpy(data, &_decoded[_cur_index * SIZE_OF_TYPE], n * SIZE_OF_TYPE); + } + + Status _decode() { + if (_num_elements > 0) { + int64_t bytes; + _decoded.resize(_num_element_after_padding * _size_of_element); + char* in = const_cast(&_data[HEADER_SIZE]); + bytes = bitshuffle::decompress_lz4(in, _decoded.data(), _num_element_after_padding, + _size_of_element, 0); + if (PREDICT_FALSE(bytes < 0)) { + // Ideally, this should not happen. + warn_with_bitshuffle_error(bytes); + return Status::RuntimeError("Unshuffle Process failed"); + } + } + return Status::OK(); + } + + typedef typename TypeTraits::CppType CppType; + + // Length of a header. + static const size_t HEADER_SIZE = sizeof(uint32_t) * 5; + enum { + SIZE_OF_TYPE = TypeTraits::size + }; + + Slice _data; + bool _parsed; + rowid_t _page_first_ordinal; + size_t _num_elements; + size_t _compressed_size; + size_t _num_element_after_padding; + + int _size_of_element; + size_t _cur_index; + faststring _decoded; +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp new file mode 100644 index 00000000000000..36ceb8ce392e9b --- /dev/null +++ b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/bitshuffle_wrapper.h" + +// Include the bitshuffle header once to get the default (non-AVX2) +// symbols. +#include + +#include "gutil/cpu.h" + +// Include the bitshuffle header again, but this time importing the +// AVX2-compiled symbols by defining some macros. +#undef BITSHUFFLE_H +#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_avx2 +#define bshuf_compress_lz4 bshuf_compress_lz4_avx2 +#define bshuf_decompress_lz4 bshuf_decompress_lz4_avx2 +#include // NOLINT(*) +#undef bshuf_compress_lz4_bound +#undef bshuf_compress_lz4 +#undef bshuf_decompress_lz4 + +using base::CPU; + +namespace doris { +namespace bitshuffle { + +// Function pointers which will be assigned the correct implementation +// for the runtime architecture. +namespace { +decltype(&bshuf_compress_lz4_bound) g_bshuf_compress_lz4_bound; +decltype(&bshuf_compress_lz4) g_bshuf_compress_lz4; +decltype(&bshuf_decompress_lz4) g_bshuf_decompress_lz4; +} // anonymous namespace + +// When this translation unit is initialized, figure out the current CPU and +// assign the correct function for this architecture. +// +// This avoids an expensive 'cpuid' call in the hot path, and also avoids +// the cost of a 'std::once' call. +__attribute__((constructor)) +void SelectBitshuffleFunctions() { + if (CPU().has_avx2()) { + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_avx2; + g_bshuf_compress_lz4 = bshuf_compress_lz4_avx2; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4_avx2; + } else { + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound; + g_bshuf_compress_lz4 = bshuf_compress_lz4; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4; + } +} + +int64_t compress_lz4(void* in, void* out, size_t size, + size_t elem_size, size_t block_size) { + return g_bshuf_compress_lz4(in, out, size, elem_size, block_size); +} +int64_t decompress_lz4(void* in, void* out, size_t size, + size_t elem_size, size_t block_size) { + return g_bshuf_decompress_lz4(in, out, size, elem_size, block_size); +} +size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size) { + return g_bshuf_compress_lz4_bound(size, elem_size, block_size); +} + +} // namespace bitshuffle +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h new file mode 100644 index 00000000000000..38c1e7231f947c --- /dev/null +++ b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +// This namespace has wrappers for the Bitshuffle library which do runtime dispatch to +// either AVX2-accelerated or regular SSE2 implementations based on the available CPU. +namespace doris { +namespace bitshuffle { + +// See for documentation on these functions. +size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size); +int64_t compress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); +int64_t decompress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); + +} // namespace bitshuffle +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/common.h b/be/src/olap/rowset/segment_v2/common.h index 61e276f2dd77e6..ba69121cedc244 100644 --- a/be/src/olap/rowset/segment_v2/common.h +++ b/be/src/olap/rowset/segment_v2/common.h @@ -20,10 +20,16 @@ #include #include +// Round down 'x' to the nearest 'align' boundary +#define ALIGN_DOWN(x, align) ((x) & (~(align) + 1)) + +// Round up 'x' to the nearest 'align' boundary +#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~(align) + 1)) + namespace doris { namespace segment_v2 { using rowid_t = uint32_t; -} -} +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/options.h b/be/src/olap/rowset/segment_v2/options.h index 3fd26d7a45f318..f4bf20521c3b59 100644 --- a/be/src/olap/rowset/segment_v2/options.h +++ b/be/src/olap/rowset/segment_v2/options.h @@ -20,25 +20,13 @@ #include "gen_cpp/segment_v2.pb.h" namespace doris { - namespace segment_v2 { -struct BuilderOptions { - size_t data_page_size; - - size_t dict_page_size; - - bool write_posidx; - - EncodingTypePB encoding; +struct PageBuilderOptions { + size_t data_page_size = 0; - CompressionTypePB compression_type; - - bool is_nullable; - - bool has_dictionary; + size_t dict_page_size = 0; }; } // namespace segment_v2 - } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/page_builder.h b/be/src/olap/rowset/segment_v2/page_builder.h index e6952766f415cc..57239a4bce1b38 100644 --- a/be/src/olap/rowset/segment_v2/page_builder.h +++ b/be/src/olap/rowset/segment_v2/page_builder.h @@ -25,7 +25,6 @@ #include "olap/rowset/segment_v2/common.h" namespace doris { - namespace segment_v2 { // PageBuilder is used to build page @@ -37,6 +36,8 @@ namespace segment_v2 { // 5. Bitmap Index Page: store bitmap index of data class PageBuilder { public: + PageBuilder() { } + virtual ~PageBuilder() { } // Used by column writer to determine whether the current page is full. @@ -51,10 +52,7 @@ class PageBuilder { virtual doris::Status add(const uint8_t* vals, size_t* count) = 0; // Get the dictionary page for dictionary encoding mode column. - virtual doris::Status get_dictionary_page(doris::Slice* dictionary_page); - - // Get the bitmap page for bitmap indexed column. - virtual doris::Status get_bitmap_page(doris::Slice* bitmap_page); + virtual doris::Status get_dictionary_page(doris::Slice* dictionary_page) = 0; // Return a Slice which represents the encoded data of current page. // @@ -69,10 +67,15 @@ class PageBuilder { // Return the number of entries that have been added to the page. virtual size_t count() const = 0; + // This api is for release the resource owned by builder + // It means it will transfer the ownership of some resource to other. + // This api is always called after finish + // and should be followed by reset() before reuse the builder + virtual void release() = 0; + private: DISALLOW_COPY_AND_ASSIGN(PageBuilder); }; } // namespace segment_v2 - } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/page_decoder.h b/be/src/olap/rowset/segment_v2/page_decoder.h index 972864235ac05b..a256978192af55 100644 --- a/be/src/olap/rowset/segment_v2/page_decoder.h +++ b/be/src/olap/rowset/segment_v2/page_decoder.h @@ -21,17 +21,18 @@ #include "common/status.h" namespace doris { - namespace segment_v2 { // PageDecoder is used to decode page page. class PageDecoder { public: + PageDecoder() { } + virtual ~PageDecoder() { } // Call this to do some preparation for decoder. // eg: parse data page header - virtual doris::Status init() = 0; + virtual Status init() = 0; // Seek the decoder to the given positional index of the page. // For example, seek_to_position_in_page(0) seeks to the first @@ -39,7 +40,7 @@ class PageDecoder { // // It is an error to call this with a value larger than Count(). // Doing so has undefined results. - virtual doris::Status seek_to_position_in_page(size_t pos) = 0; + virtual Status seek_to_position_in_page(size_t pos) = 0; // Seek the decoder forward by a given number of rows, or to the end // of the page. This is primarily used to skip over data. @@ -60,7 +61,7 @@ class PageDecoder { // In the case that the values are themselves references // to other memory (eg Slices), the referred-to memory is // allocated in the column_vector_view's mem_pool. - virtual doris::Status next_batch(size_t* n, doris::ColumnVectorView* column_vector_view) = 0; + virtual Status next_batch(size_t* n, ColumnVectorView* dst) = 0; // Return the number of elements in this page. virtual size_t count() const = 0; @@ -77,5 +78,4 @@ class PageDecoder { }; } // namespace segment_v2 - } // namespace doris diff --git a/be/src/util/slice.h b/be/src/util/slice.h index 85176f62d297e8..5c31072fe2e1e4 100644 --- a/be/src/util/slice.h +++ b/be/src/util/slice.h @@ -71,22 +71,20 @@ struct Slice { Slice(const char* s) : // NOLINT(runtime/explicit) data(const_cast(s)), size(strlen(s)) { } - /* /// @return A pointer to the beginning of the referenced data. - const char* data() const { return data; } + const char* get_data() const { return data; } /// @return A mutable pointer to the beginning of the referenced data. char* mutable_data() { return const_cast(data); } /// @return The length (in bytes) of the referenced data. - size_t size() const { return size; } - */ + size_t get_size() const { return size; } /// @return @c true iff the length of the referenced data is zero. bool empty() const { return size == 0; } /// @return the n-th byte in the referenced data. - const char operator[](size_t n) const { + const char& operator[](size_t n) const { assert(n < size); return data[n]; } diff --git a/be/test/olap/CMakeLists.txt b/be/test/olap/CMakeLists.txt index 3084c03f047188..f5bed29fd6fca3 100644 --- a/be/test/olap/CMakeLists.txt +++ b/be/test/olap/CMakeLists.txt @@ -46,3 +46,4 @@ ADD_BE_TEST(olap_header_manager_test) ADD_BE_TEST(field_info_test) ADD_BE_TEST(rowset/segment_v2/ordinal_page_index_test) ADD_BE_TEST(rowset/segment_v2/encoding_info_test) +ADD_BE_TEST(rowset/segment_v2/bitshuffle_page_test) diff --git a/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp b/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp new file mode 100644 index 00000000000000..06a1d05fc1028b --- /dev/null +++ b/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp @@ -0,0 +1,200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/page_builder.h" +#include "olap/rowset/segment_v2/page_decoder.h" +#include "olap/rowset/segment_v2/bitshuffle_page.h" +#include "util/logging.h" + +using doris::segment_v2::PageBuilderOptions; + +namespace doris { + +class BitShufflePageTest : public testing::Test { +public: + virtual ~BitShufflePageTest() {} + + template + void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) { + std::unique_ptr dst_vector(new ColumnVector()); + dst_vector->set_col_data((void*)ret); + std::unique_ptr mem_tracer(new MemTracker(-1)); + std::unique_ptr mem_pool(new MemPool(mem_tracer.get())); + ColumnVectorView column_vector_view(dst_vector.get(), 0, mem_pool.get()); + size_t n = 1; + decoder->_copy_next_values(n, column_vector_view.column_vector()->col_data()); + ASSERT_EQ(1, n); + } + + template + void test_encode_decode_page_template(typename TypeTraits::CppType* src, + size_t size) { + typedef typename TypeTraits::CppType CppType; + const size_t ordinal_pos_base = 12345; + PageBuilderOptions options; + options.data_page_size = 256 * 1024; + PageBuilderType page_builder(options); + + page_builder.add(reinterpret_cast(src), &size); + Slice s = page_builder.finish(ordinal_pos_base); + + PageDecoderType page_decoder(s); + Status status = page_decoder.init(); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(ordinal_pos_base, page_decoder.get_first_rowid()); + ASSERT_EQ(0, page_decoder.current_index()); + + std::unique_ptr dst_vector(new ColumnVector()); + std::unique_ptr mem_tracer(new MemTracker(-1)); + std::unique_ptr mem_pool(new MemPool(mem_tracer.get())); + CppType* values = reinterpret_cast(mem_pool->allocate(size * sizeof(CppType))); + dst_vector->set_col_data(values); + ColumnVectorView column_vector_view(dst_vector.get(), 0, mem_pool.get()); + status = page_decoder.next_batch(&size, &column_vector_view); + ASSERT_TRUE(status.ok()); + + CppType* decoded = (CppType*)dst_vector->col_data(); + for (uint i = 0; i < size; i++) { + if (src[i] != decoded[i]) { + FAIL() << "Fail at index " << i << + " inserted=" << src[i] << " got=" << decoded[i]; + } + } + + // Test Seek within block by ordinal + for (int i = 0; i < 100; i++) { + int seek_off = random() % size; + page_decoder.seek_to_position_in_page(seek_off); + EXPECT_EQ((int32_t )(seek_off), page_decoder.current_index()); + CppType ret; + copy_one(&page_decoder, &ret); + EXPECT_EQ(decoded[seek_off], ret); + } + } +}; + +// Test for bitshuffle block, for INT32, INT64, FLOAT, DOUBLE +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = random(); + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt64BlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int64_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = random(); + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleFloatBlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr floats(new float[size]); + for (int i = 0; i < size; i++) { + floats.get()[i] = random() + static_cast(random())/INT_MAX; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(floats.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr doubles(new double[size]); + for (int i = 0; i < size; i++) { + doubles.get()[i] = random() + static_cast(random())/INT_MAX; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(doubles.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderEqual) { + const uint32_t size = 10000; + + std::unique_ptr doubles(new double[size]); + for (int i = 0; i < size; i++) { + doubles.get()[i] = 19880217.19890323; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(doubles.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderSequence) { + const uint32_t size = 10000; + + double base = 19880217.19890323; + double delta = 13.14; + std::unique_ptr doubles(new double[size]); + for (int i = 0; i < size; i++) { + base = base + delta; + doubles.get()[i] = base; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(doubles.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderEqual) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 12345; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderSequence) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + int32_t number = 0; + for (int i = 0; i < size; i++) { + ints.get()[i] = ++number; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/run-ut.sh b/run-ut.sh index 203bdd9c88be55..8736262553c223 100755 --- a/run-ut.sh +++ b/run-ut.sh @@ -131,114 +131,111 @@ if [ -d ${DORIS_TEST_BINARY_DIR}/util/test_data ]; then fi cp -r ${DORIS_HOME}/be/test/util/test_data ${DORIS_TEST_BINARY_DIR}/util/ -# Running common Unittest -${DORIS_TEST_BINARY_DIR}/common/status_test - -# Running Util Unittest -${DORIS_TEST_BINARY_DIR}/util/bit_util_test -${DORIS_TEST_BINARY_DIR}/util/bitmap_test -${DORIS_TEST_BINARY_DIR}/util/path_trie_test -${DORIS_TEST_BINARY_DIR}/util/count_down_latch_test -${DORIS_TEST_BINARY_DIR}/util/lru_cache_util_test -${DORIS_TEST_BINARY_DIR}/util/filesystem_util_test -${DORIS_TEST_BINARY_DIR}/util/internal_queue_test -${DORIS_TEST_BINARY_DIR}/util/cidr_test -${DORIS_TEST_BINARY_DIR}/util/new_metrics_test -${DORIS_TEST_BINARY_DIR}/util/doris_metrics_test -${DORIS_TEST_BINARY_DIR}/util/system_metrics_test -${DORIS_TEST_BINARY_DIR}/util/core_local_test -${DORIS_TEST_BINARY_DIR}/util/arena_test -${DORIS_TEST_BINARY_DIR}/util/types_test -${DORIS_TEST_BINARY_DIR}/util/json_util_test -${DORIS_TEST_BINARY_DIR}/util/byte_buffer_test2 -${DORIS_TEST_BINARY_DIR}/util/uid_util_test -${DORIS_TEST_BINARY_DIR}/util/aes_util_test -${DORIS_TEST_BINARY_DIR}/util/string_util_test -${DORIS_TEST_BINARY_DIR}/util/coding_test -${DORIS_TEST_BINARY_DIR}/util/faststring_test -${DORIS_TEST_BINARY_DIR}/util/rle_encoding_test - -## Running common Unittest -${DORIS_TEST_BINARY_DIR}/common/resource_tls_test - -## Running exprs unit test -${DORIS_TEST_BINARY_DIR}/exprs/string_functions_test -${DORIS_TEST_BINARY_DIR}/exprs/json_function_test - -## Running geo unit test -${DORIS_TEST_BINARY_DIR}/geo/geo_functions_test -${DORIS_TEST_BINARY_DIR}/geo/wkt_parse_test -${DORIS_TEST_BINARY_DIR}/geo/geo_types_test - -## Running exec unit test -${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_uncompressed_test -${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_gzip_test -${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_bzip_test -${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_lz4frame_test -if [ -f ${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_lzop_test ];then - ${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_lzop_test -fi -${DORIS_TEST_BINARY_DIR}/exec/broker_scanner_test -${DORIS_TEST_BINARY_DIR}/exec/broker_scan_node_test -${DORIS_TEST_BINARY_DIR}/exec/es_scan_node_test -${DORIS_TEST_BINARY_DIR}/exec/es_http_scan_node_test -${DORIS_TEST_BINARY_DIR}/exec/es_predicate_test -${DORIS_TEST_BINARY_DIR}/exec/es_scan_reader_test -${DORIS_TEST_BINARY_DIR}/exec/es_query_builder_test -${DORIS_TEST_BINARY_DIR}/exec/olap_table_info_test -${DORIS_TEST_BINARY_DIR}/exec/olap_table_sink_test - -## Running runtime Unittest -${DORIS_TEST_BINARY_DIR}/runtime/fragment_mgr_test -${DORIS_TEST_BINARY_DIR}/runtime/decimal_value_test -${DORIS_TEST_BINARY_DIR}/runtime/datetime_value_test -${DORIS_TEST_BINARY_DIR}/runtime/large_int_value_test -${DORIS_TEST_BINARY_DIR}/runtime/string_value_test -${DORIS_TEST_BINARY_DIR}/runtime/free_list_test -${DORIS_TEST_BINARY_DIR}/runtime/string_buffer_test -${DORIS_TEST_BINARY_DIR}/runtime/stream_load_pipe_test -${DORIS_TEST_BINARY_DIR}/runtime/tablet_writer_mgr_test -${DORIS_TEST_BINARY_DIR}/runtime/snapshot_loader_test -${DORIS_TEST_BINARY_DIR}/runtime/user_function_cache_test -${DORIS_TEST_BINARY_DIR}/runtime/small_file_mgr_test -# Running expr Unittest - -# Running http -${DORIS_TEST_BINARY_DIR}/http/metrics_action_test -${DORIS_TEST_BINARY_DIR}/http/http_utils_test -${DORIS_TEST_BINARY_DIR}/http/stream_load_test -${DORIS_TEST_BINARY_DIR}/http/http_client_test - -# Running OLAPEngine Unittest -${DORIS_TEST_BINARY_DIR}/olap/bit_field_test -${DORIS_TEST_BINARY_DIR}/olap/byte_buffer_test -${DORIS_TEST_BINARY_DIR}/olap/run_length_byte_test -${DORIS_TEST_BINARY_DIR}/olap/run_length_integer_test -${DORIS_TEST_BINARY_DIR}/olap/stream_index_test -${DORIS_TEST_BINARY_DIR}/olap/lru_cache_test -${DORIS_TEST_BINARY_DIR}/olap/bloom_filter_test -${DORIS_TEST_BINARY_DIR}/olap/bloom_filter_index_test -${DORIS_TEST_BINARY_DIR}/olap/row_block_test -${DORIS_TEST_BINARY_DIR}/olap/comparison_predicate_test -${DORIS_TEST_BINARY_DIR}/olap/in_list_predicate_test -${DORIS_TEST_BINARY_DIR}/olap/null_predicate_test -${DORIS_TEST_BINARY_DIR}/olap/file_helper_test -${DORIS_TEST_BINARY_DIR}/olap/file_utils_test -${DORIS_TEST_BINARY_DIR}/olap/delete_handler_test -${DORIS_TEST_BINARY_DIR}/olap/column_reader_test -${DORIS_TEST_BINARY_DIR}/olap/row_cursor_test -${DORIS_TEST_BINARY_DIR}/olap/skiplist_test -${DORIS_TEST_BINARY_DIR}/olap/serialize_test -${DORIS_TEST_BINARY_DIR}/olap/olap_header_manager_test -${DORIS_TEST_BINARY_DIR}/olap/olap_meta_test -${DORIS_TEST_BINARY_DIR}/olap/delta_writer_test -${DORIS_TEST_BINARY_DIR}/olap/field_info_test -${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/encoding_info_test -${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/ordinal_page_index_test - -# Running routine load test -${DORIS_TEST_BINARY_DIR}/runtime/kafka_consumer_pipe_test -${DORIS_TEST_BINARY_DIR}/runtime/routine_load_task_executor_test +## Running Util Unittest +#${DORIS_TEST_BINARY_DIR}/util/bit_util_test +#${DORIS_TEST_BINARY_DIR}/util/bitmap_test +#${DORIS_TEST_BINARY_DIR}/util/path_trie_test +#${DORIS_TEST_BINARY_DIR}/util/count_down_latch_test +#${DORIS_TEST_BINARY_DIR}/util/lru_cache_util_test +#${DORIS_TEST_BINARY_DIR}/util/filesystem_util_test +#${DORIS_TEST_BINARY_DIR}/util/internal_queue_test +#${DORIS_TEST_BINARY_DIR}/util/cidr_test +#${DORIS_TEST_BINARY_DIR}/util/new_metrics_test +#${DORIS_TEST_BINARY_DIR}/util/doris_metrics_test +#${DORIS_TEST_BINARY_DIR}/util/system_metrics_test +#${DORIS_TEST_BINARY_DIR}/util/core_local_test +#${DORIS_TEST_BINARY_DIR}/util/arena_test +#${DORIS_TEST_BINARY_DIR}/util/types_test +#${DORIS_TEST_BINARY_DIR}/util/json_util_test +#${DORIS_TEST_BINARY_DIR}/util/byte_buffer_test2 +#${DORIS_TEST_BINARY_DIR}/util/uid_util_test +#${DORIS_TEST_BINARY_DIR}/util/aes_util_test +#${DORIS_TEST_BINARY_DIR}/util/string_util_test +#${DORIS_TEST_BINARY_DIR}/util/coding_test +#${DORIS_TEST_BINARY_DIR}/util/faststring_test +# +### Running common Unittest +#${DORIS_TEST_BINARY_DIR}/common/resource_tls_test +# +### Running exprs unit test +#${DORIS_TEST_BINARY_DIR}/exprs/string_functions_test +#${DORIS_TEST_BINARY_DIR}/exprs/json_function_test +# +### Running geo unit test +#${DORIS_TEST_BINARY_DIR}/geo/geo_functions_test +#${DORIS_TEST_BINARY_DIR}/geo/wkt_parse_test +#${DORIS_TEST_BINARY_DIR}/geo/geo_types_test +# +### Running exec unit test +#${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_uncompressed_test +#${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_gzip_test +#${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_bzip_test +#${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_lz4frame_test +#if [ -f ${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_lzop_test ];then +# ${DORIS_TEST_BINARY_DIR}/exec/plain_text_line_reader_lzop_test +#fi +#${DORIS_TEST_BINARY_DIR}/exec/broker_scanner_test +#${DORIS_TEST_BINARY_DIR}/exec/broker_scan_node_test +#${DORIS_TEST_BINARY_DIR}/exec/es_scan_node_test +#${DORIS_TEST_BINARY_DIR}/exec/es_http_scan_node_test +#${DORIS_TEST_BINARY_DIR}/exec/es_predicate_test +#${DORIS_TEST_BINARY_DIR}/exec/es_scan_reader_test +#${DORIS_TEST_BINARY_DIR}/exec/es_query_builder_test +#${DORIS_TEST_BINARY_DIR}/exec/olap_table_info_test +#${DORIS_TEST_BINARY_DIR}/exec/olap_table_sink_test +# +### Running runtime Unittest +#${DORIS_TEST_BINARY_DIR}/runtime/fragment_mgr_test +#${DORIS_TEST_BINARY_DIR}/runtime/decimal_value_test +#${DORIS_TEST_BINARY_DIR}/runtime/datetime_value_test +#${DORIS_TEST_BINARY_DIR}/runtime/large_int_value_test +#${DORIS_TEST_BINARY_DIR}/runtime/string_value_test +#${DORIS_TEST_BINARY_DIR}/runtime/free_list_test +#${DORIS_TEST_BINARY_DIR}/runtime/string_buffer_test +#${DORIS_TEST_BINARY_DIR}/runtime/stream_load_pipe_test +#${DORIS_TEST_BINARY_DIR}/runtime/tablet_writer_mgr_test +#${DORIS_TEST_BINARY_DIR}/runtime/snapshot_loader_test +#${DORIS_TEST_BINARY_DIR}/runtime/user_function_cache_test +#${DORIS_TEST_BINARY_DIR}/runtime/small_file_mgr_test +## Running expr Unittest +# +## Running http +#${DORIS_TEST_BINARY_DIR}/http/metrics_action_test +#${DORIS_TEST_BINARY_DIR}/http/http_utils_test +#${DORIS_TEST_BINARY_DIR}/http/stream_load_test +#${DORIS_TEST_BINARY_DIR}/http/http_client_test +# +## Running OLAPEngine Unittest +#${DORIS_TEST_BINARY_DIR}/olap/bit_field_test +#${DORIS_TEST_BINARY_DIR}/olap/byte_buffer_test +#${DORIS_TEST_BINARY_DIR}/olap/run_length_byte_test +#${DORIS_TEST_BINARY_DIR}/olap/run_length_integer_test +#${DORIS_TEST_BINARY_DIR}/olap/stream_index_test +#${DORIS_TEST_BINARY_DIR}/olap/lru_cache_test +#${DORIS_TEST_BINARY_DIR}/olap/bloom_filter_test +#${DORIS_TEST_BINARY_DIR}/olap/bloom_filter_index_test +#${DORIS_TEST_BINARY_DIR}/olap/row_block_test +#${DORIS_TEST_BINARY_DIR}/olap/comparison_predicate_test +#${DORIS_TEST_BINARY_DIR}/olap/in_list_predicate_test +#${DORIS_TEST_BINARY_DIR}/olap/null_predicate_test +#${DORIS_TEST_BINARY_DIR}/olap/file_helper_test +#${DORIS_TEST_BINARY_DIR}/olap/file_utils_test +#${DORIS_TEST_BINARY_DIR}/olap/delete_handler_test +#${DORIS_TEST_BINARY_DIR}/olap/column_reader_test +#${DORIS_TEST_BINARY_DIR}/olap/row_cursor_test +#${DORIS_TEST_BINARY_DIR}/olap/skiplist_test +#${DORIS_TEST_BINARY_DIR}/olap/serialize_test +#${DORIS_TEST_BINARY_DIR}/olap/olap_header_manager_test +#${DORIS_TEST_BINARY_DIR}/olap/olap_meta_test +#${DORIS_TEST_BINARY_DIR}/olap/delta_writer_test +#${DORIS_TEST_BINARY_DIR}/olap/field_info_test +#${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/encoding_info_test +#${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/ordinal_page_index_test +${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/bitshuffle_page_test +# +## Running routine load test +#${DORIS_TEST_BINARY_DIR}/runtime/kafka_consumer_pipe_test +#${DORIS_TEST_BINARY_DIR}/runtime/routine_load_task_executor_test ## Running agent unittest # Prepare agent testdata diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh index 4227f20ae17b5d..bbd1e02f0b704a 100755 --- a/thirdparty/build-thirdparty.sh +++ b/thirdparty/build-thirdparty.sh @@ -588,7 +588,7 @@ build_bitshuffle() { arch_flag="" if [ "$arch" == "avx2" ]; then arch_flag="-mavx2" - fi + fi tmp_obj=bitshuffle_${arch}_tmp.o dst_obj=bitshuffle_${arch}.o ${CC:-gcc} $EXTRA_CFLAGS $arch_flag -std=c99 -I$PREFIX/include/lz4/ -O3 -DNDEBUG -fPIC -c \