diff --git a/be/src/olap/field.h b/be/src/olap/field.h index 04167899e718be..7a2c2a43ec8175 100644 --- a/be/src/olap/field.h +++ b/be/src/olap/field.h @@ -57,6 +57,7 @@ class Field { inline void set_to_max(char* buf) const { return _type_info->set_to_max(buf); } inline void set_to_min(char* buf) const { return _type_info->set_to_min(buf); } + inline char* allocate_value_from_arena(Arena* arena) const { return _type_info->allocate_value_from_arena(arena); } inline void agg_update(RowCursorCell* dest, const RowCursorCell& src, MemPool* mem_pool = nullptr) const { _agg_info->update(dest, src, mem_pool); @@ -199,6 +200,10 @@ class Field { _type_info->deep_copy_with_arena(dest, src, arena); } + inline void direct_copy_content(char* dest, const char* src) const { + _type_info->direct_copy(dest, src); + } + // Copy srouce content to destination in index format. template void to_index(DstCellType* dst, const SrcCellType& src) const; diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h index 3735f1e2c8f104..685a9ef1b8ef1c 100644 --- a/be/src/olap/olap_define.h +++ b/be/src/olap/olap_define.h @@ -50,9 +50,12 @@ static const uint64_t OLAP_FIX_HEADER_MAGIC_NUMBER = 0; // 执行be/ce时默认的候选集大小 static constexpr uint32_t OLAP_COMPACTION_DEFAULT_CANDIDATE_SIZE = 10; -// the max length supported for string type +// the max length supported for varchar type static const uint16_t OLAP_STRING_MAX_LENGTH = 65535; +//the max length supported for char type +static const uint16_t OLAP_CHAR_MAX_LENGTH = 255; + static const int32_t PREFERRED_SNAPSHOT_VERSION = 3; // the max bytes for stored string length diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index 675ed26d58464b..3c80e29d4b84e3 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -102,7 +102,7 @@ Slice BinaryDictPageBuilder::finish() { Slice data_slice = _data_page_builder->finish(); 
_buffer.append(data_slice.data, data_slice.size); encode_fixed32_le(&_buffer[0], _encoding_type); - return Slice(_buffer.data(), _buffer.size()); + return Slice(_buffer); } void BinaryDictPageBuilder::reset() { @@ -147,7 +147,6 @@ BinaryDictPageDecoder::BinaryDictPageDecoder(Slice data, const PageDecoderOption _data(data), _options(options), _data_page_decoder(nullptr), - _dict_decoder(options.dict_decoder), _parsed(false), _encoding_type(UNKNOWN_ENCODING) { } @@ -161,7 +160,6 @@ Status BinaryDictPageDecoder::init() { _encoding_type = static_cast(type); _data.remove_prefix(BINARY_DICT_PAGE_HEADER_SIZE); if (_encoding_type == DICT_ENCODING) { - DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr"; _data_page_decoder.reset(new BitShufflePageDecoder(_data, _options)); } else if (_encoding_type == PLAIN_ENCODING) { DCHECK_EQ(_encoding_type, PLAIN_ENCODING); @@ -180,12 +178,21 @@ Status BinaryDictPageDecoder::seek_to_position_in_page(size_t pos) { return _data_page_decoder->seek_to_position_in_page(pos); } +bool BinaryDictPageDecoder::is_dict_encoding() const { + return _encoding_type == DICT_ENCODING; +} + +void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder){ + _dict_decoder = (BinaryPlainPageDecoder*)dict_decoder; +}; + Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) { if (_encoding_type == PLAIN_ENCODING) { return _data_page_decoder->next_batch(n, dst); } // dictionary encoding DCHECK(_parsed); + DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr"; if (PREDICT_FALSE(*n == 0)) { *n = 0; return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h index e434e99a12d6a5..7951efd7bf03a7 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.h +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h @@ -116,11 +116,15 @@ class BinaryDictPageDecoder : public PageDecoder { return _data_page_decoder->current_index(); } + 
bool is_dict_encoding() const; + + void set_dict_decoder(PageDecoder* dict_decoder); + private: Slice _data; PageDecoderOptions _options; std::unique_ptr _data_page_decoder; - BinaryPlainPageDecoder* _dict_decoder; + const BinaryPlainPageDecoder* _dict_decoder = nullptr; bool _parsed; EncodingTypePB _encoding_type; faststring _code_buf; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index c08b39afec204d..9c7c69dae9ae00 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -32,6 +32,7 @@ #include "util/crc32c.h" #include "util/rle_encoding.h" // for RleDecoder #include "util/block_compression.h" +#include "olap/rowset/segment_v2/binary_dict_page.h" // for BinaryDictPageDecoder namespace doris { namespace segment_v2 { @@ -168,6 +169,10 @@ void ColumnReader::get_row_ranges_by_zone_map(CondColumn* cond_column, _calculate_row_ranges(page_indexes, row_ranges); } +PagePointer ColumnReader::get_dict_page_pointer() const { + return _meta.dict_page(); +} + void ColumnReader::_get_filtered_pages(CondColumn* cond_column, const std::vector& delete_conditions, std::vector* page_indexes) { FieldType type = _type_info->type(); @@ -426,6 +431,24 @@ Status FileColumnIterator::_read_page(const OrdinalPageIndexIterator& iter, Pars RETURN_IF_ERROR(_reader->encoding_info()->create_page_decoder(data, options, &page->data_decoder)); RETURN_IF_ERROR(page->data_decoder->init()); + // Load the dictionary page lazily, for three reasons: + // 1. A column that uses dictionary encoding can still contain data pages that are not dict-encoded, so the dictionary is loaded only when a dict-encoded page is read. + // 2. ColumnReader is owned by Segment and Rowset and can stay alive while no query is running, so it should hold as little memory as possible. + // 3. Iterators over the same column do not load the dict page repeatedly, because of the page cache. 
+ if (_reader->encoding_info()->encoding() == DICT_ENCODING) { + BinaryDictPageDecoder* binary_dict_page_decoder = (BinaryDictPageDecoder*)page->data_decoder; + if (binary_dict_page_decoder->is_dict_encoding()) { + if (_dict_decoder == nullptr) { + PagePointer pp = _reader->get_dict_page_pointer(); + RETURN_IF_ERROR(_reader->read_page(pp, &_dict_page_handle)); + + _dict_decoder.reset(new BinaryPlainPageDecoder(_dict_page_handle.data())); + RETURN_IF_ERROR(_dict_decoder->init()); + } + binary_dict_page_decoder->set_dict_decoder(_dict_decoder.get()); + } + } + page->offset_in_page = 0; return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 6d425cb9ca21f9..af34db2903beef 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -29,6 +29,7 @@ #include "olap/rowset/segment_v2/ordinal_page_index.h" // for OrdinalPageIndexIterator #include "olap/rowset/segment_v2/column_zone_map.h" // for ColumnZoneMap #include "olap/rowset/segment_v2/row_ranges.h" // for RowRanges +#include "olap/rowset/segment_v2/page_handle.h" // for PageHandle namespace doris { @@ -85,6 +86,8 @@ class ColumnReader { void get_row_ranges_by_zone_map(CondColumn* cond_column, const std::vector& delete_conditions, RowRanges* row_ranges); + PagePointer get_dict_page_pointer() const; + private: Status _init_ordinal_index(); @@ -189,6 +192,12 @@ class FileColumnIterator : public ColumnIterator { // 3. When _page is null, it means that this reader can not be read. std::unique_ptr _page; + // keep dict page decoder + std::unique_ptr _dict_decoder; + + // keep dict page handle to avoid released + PageHandle _dict_page_handle; + // page iterator used to get next page when current page is finished. 
// This value will be reset when a new seek is issued OrdinalPageIndexIterator _page_iter; diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index fe8dade03e8b36..4faf37270ae734 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -213,6 +213,14 @@ Status ColumnWriter::write_data() { RETURN_IF_ERROR(_write_data_page(page)); page = page->next; } + // write column dict + if (_encoding_info->encoding() == DICT_ENCODING) { + Slice dict_page; + _page_builder->get_dictionary_page(&dict_page); + std::vector origin_data; + origin_data.push_back(dict_page); + RETURN_IF_ERROR(_write_physical_page(&origin_data, &_dict_page_pp)); + } return Status::OK(); } @@ -240,6 +248,9 @@ void ColumnWriter::write_meta(ColumnMetaPB* meta) { if (_opts.need_zone_map) { _zone_map_pp.to_proto(meta->mutable_zone_map_page()); } + if (_encoding_info->encoding() == DICT_ENCODING) { + _dict_page_pp.to_proto(meta->mutable_dict_page()); + } } // write a page into file and update ordinal index diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 76cd5b4ab506e9..b881c13a3ecd00 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -157,6 +157,7 @@ class ColumnWriter { PagePointer _ordinal_index_pp; PagePointer _zone_map_pp; + PagePointer _dict_page_pp; uint64_t _written_size = 0; }; diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.cpp b/be/src/olap/rowset/segment_v2/column_zone_map.cpp index 909a9cfa4c2ba6..61dd3f25639810 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.cpp +++ b/be/src/olap/rowset/segment_v2/column_zone_map.cpp @@ -28,9 +28,9 @@ ColumnZoneMapBuilder::ColumnZoneMapBuilder(const TypeInfo* type_info) : _type_in options.data_page_size = 0; _page_builder.reset(new BinaryPlainPageBuilder(options)); 
_field.reset(FieldFactory::create_by_type(_type_info->type())); - _max_string_value = _arena.Allocate(OLAP_STRING_MAX_LENGTH); - _zone_map.min_value = _arena.Allocate(_type_info->size()); - _zone_map.max_value = _arena.Allocate(_type_info->size()); + _zone_map.min_value = _field->allocate_value_from_arena(&_arena); + _zone_map.max_value = _field->allocate_value_from_arena(&_arena); + _reset_zone_map(); } @@ -38,10 +38,10 @@ Status ColumnZoneMapBuilder::add(const uint8_t *vals, size_t count) { if (vals != nullptr) { for (int i = 0; i < count; ++i) { if (_field->compare(_zone_map.min_value, (char *)vals) > 0) { - _field->deep_copy_content(_zone_map.min_value, (const char *)vals, &_arena); + _field->direct_copy_content(_zone_map.min_value, (const char *)vals); } if (_field->compare(_zone_map.max_value, (char *)vals) < 0) { - _field->deep_copy_content(_zone_map.max_value, (const char *)vals, &_arena); + _field->direct_copy_content(_zone_map.max_value, (const char *)vals); } vals += _type_info->size(); if (!_zone_map.has_not_null) { @@ -78,10 +78,6 @@ Status ColumnZoneMapBuilder::flush() { } void ColumnZoneMapBuilder::_reset_zone_map() { - // we should allocate max varchar length and set to max for min value - Slice *min_slice = (Slice *)_zone_map.min_value; - min_slice->data = _max_string_value; - min_slice->size = OLAP_STRING_MAX_LENGTH; _field->set_to_max(_zone_map.min_value); _field->set_to_min(_zone_map.max_value); _zone_map.has_null = false; diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.h b/be/src/olap/rowset/segment_v2/column_zone_map.h index 1fb01bc2d80955..3e2b7ac78f6a12 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.h +++ b/be/src/olap/rowset/segment_v2/column_zone_map.h @@ -73,7 +73,6 @@ class ColumnZoneMapBuilder { std::unique_ptr _field; // memory will be managed by arena ZoneMap _zone_map; - char* _max_string_value; Arena _arena; }; diff --git a/be/src/olap/rowset/segment_v2/encoding_info.cpp 
b/be/src/olap/rowset/segment_v2/encoding_info.cpp index 3cfb71525cd417..31539a21681986 100644 --- a/be/src/olap/rowset/segment_v2/encoding_info.cpp +++ b/be/src/olap/rowset/segment_v2/encoding_info.cpp @@ -20,6 +20,7 @@ #include "olap/olap_common.h" #include "olap/rowset/segment_v2/bitshuffle_page.h" #include "olap/rowset/segment_v2/rle_page.h" +#include "olap/rowset/segment_v2/binary_dict_page.h" namespace doris { namespace segment_v2 { @@ -67,6 +68,18 @@ struct TypeEncodingTraits { } }; +template +struct TypeEncodingTraits { + static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) { + *builder = new BinaryDictPageBuilder(opts); + return Status::OK(); + } + static Status create_page_decoder(const Slice& data, const PageDecoderOptions& opts, PageDecoder** decoder) { + *decoder = new BinaryDictPageDecoder(data, opts); + return Status::OK(); + } +}; + template struct EncodingTraits : TypeEncodingTraits { static const FieldType type = Type; @@ -122,6 +135,10 @@ EncodingInfoResolver::EncodingInfoResolver() { _add_map(); _add_map(); _add_map(); + _add_map(); + _add_map(); + _add_map(); + _add_map(); _add_map(); _add_map(); _add_map(); diff --git a/be/src/olap/rowset/segment_v2/options.h b/be/src/olap/rowset/segment_v2/options.h index 386afad33283fe..3997e0dc751e81 100644 --- a/be/src/olap/rowset/segment_v2/options.h +++ b/be/src/olap/rowset/segment_v2/options.h @@ -31,7 +31,6 @@ struct PageBuilderOptions { }; struct PageDecoderOptions { - BinaryPlainPageDecoder* dict_decoder = nullptr; }; } // namespace segment_v2 diff --git a/be/src/olap/types.cpp b/be/src/olap/types.cpp index 87ef27b6430d25..36704fcdd9d82d 100644 --- a/be/src/olap/types.cpp +++ b/be/src/olap/types.cpp @@ -27,6 +27,7 @@ TypeInfo::TypeInfo(TypeTraitsClass t) _deep_copy(TypeTraitsClass::deep_copy), _deep_copy_with_arena(TypeTraitsClass::deep_copy_with_arena), _direct_copy(TypeTraitsClass::direct_copy), + 
_allocate_value_from_arena(TypeTraitsClass::allocate_value_from_arena), _from_string(TypeTraitsClass::from_string), _to_string(TypeTraitsClass::to_string), _set_to_max(TypeTraitsClass::set_to_max), diff --git a/be/src/olap/types.h b/be/src/olap/types.h index 2b63035c512b01..bd381106a02c57 100644 --- a/be/src/olap/types.h +++ b/be/src/olap/types.h @@ -64,6 +64,10 @@ class TypeInfo { _direct_copy(dest, src); } + inline char* allocate_value_from_arena(Arena* arena) const { + return _allocate_value_from_arena(arena); + } + OLAPStatus from_string(void* buf, const std::string& scan_key) const { return _from_string(buf, scan_key); } @@ -85,6 +89,7 @@ class TypeInfo { void (*_deep_copy)(void* dest, const void* src, MemPool* mem_pool); void (*_deep_copy_with_arena)(void* dest, const void* src, Arena* arena); void (*_direct_copy)(void* dest, const void* src); + char* (*_allocate_value_from_arena)(Arena* arena); OLAPStatus (*_from_string)(void* buf, const std::string& scan_key); std::string (*_to_string)(const void* src); @@ -213,6 +218,10 @@ struct BaseFieldtypeTraits : public CppTypeTraits { return HashUtil::hash(data, sizeof(CppType), seed); } + static inline char* allocate_value_from_arena(Arena* arena) { + return arena->Allocate(sizeof(CppType)); + } + static std::string to_string(const void* src) { std::stringstream stream; stream << *reinterpret_cast(src); @@ -568,6 +577,13 @@ struct FieldTypeTraits : public BaseFieldtypeTraits(data); return HashUtil::hash(slice->data, slice->size, seed); } + static char* allocate_value_from_arena(Arena* arena) { + char* type_value = arena->Allocate(sizeof(Slice)); + auto slice = reinterpret_cast(type_value); + slice->size = OLAP_CHAR_MAX_LENGTH; + slice->data = arena->Allocate(OLAP_CHAR_MAX_LENGTH); + return type_value; + } }; template<> @@ -594,6 +610,13 @@ struct FieldTypeTraits : public FieldTypeTraits(buf); slice->size = 0; } + static char* allocate_value_from_arena(Arena* arena) { + char* type_value = 
arena->Allocate(sizeof(Slice)); + auto slice = reinterpret_cast(type_value); + slice->size = OLAP_STRING_MAX_LENGTH; + slice->data = arena->Allocate(OLAP_STRING_MAX_LENGTH); + return type_value; + } }; template<> diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp index 9a25b7a845a816..d082bca5cce939 100644 --- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp @@ -62,8 +62,9 @@ class BinaryDictPageTest : public testing::Test { // decode PageDecoderOptions decoder_options; - decoder_options.dict_decoder = dict_page_decoder.get(); BinaryDictPageDecoder page_decoder(s, decoder_options); + page_decoder.set_dict_decoder(dict_page_decoder.get()); + status = page_decoder.init(); ASSERT_TRUE(status.ok()); ASSERT_EQ(slices.size(), page_decoder.count()); @@ -154,9 +155,9 @@ class BinaryDictPageTest : public testing::Test { // decode PageDecoderOptions decoder_options; - decoder_options.dict_decoder = dict_page_decoder.get(); BinaryDictPageDecoder page_decoder(results[slice_index], decoder_options); status = page_decoder.init(); + page_decoder.set_dict_decoder(dict_page_decoder.get()); ASSERT_TRUE(status.ok()); //check values diff --git a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp index 0ae68fbee123fa..45f02422b87b8a 100644 --- a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp @@ -56,7 +56,7 @@ class BinaryPlainPageTest : public testing::Test { PageDecoderType page_decoder(s, decoder_options); Status status = page_decoder.init(); ASSERT_TRUE(status.ok()); - + //test1 size_t size = 3; diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp index 80ce4ffdacd48e..01121ce872361c 100644 --- 
a/be/test/olap/rowset/segment_v2/segment_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_test.cpp @@ -606,6 +606,253 @@ TEST_F(SegmentReaderWriterTest, TestDefaultValueColumn) { } } +// Test helper: writes the decimal text of `src` into the cell that `target` points to. +// CHAR: a `_length`-byte Slice holding the text followed by zero padding (NOTE(review): assumes `_length` >= the number of characters in the text). VARCHAR: a Slice holding exactly the text. Any other type: `src` stored as a plain int. +// Slice bytes are allocated from `_arena`. +void set_column_value_by_type(FieldType fieldType, int src, char* target, Arena* _arena, size_t _length = 0) { + if (fieldType == OLAP_FIELD_TYPE_CHAR) { + // Keep the text in a named std::string: taking `&std::to_string(src)[0]` would leave a pointer + // into a temporary that is destroyed at the end of that statement, so later reads would touch freed memory. + const std::string src_value = std::to_string(src); + int src_len = static_cast<int>(src_value.size()); + + auto* dest_slice = (Slice*)target; + dest_slice->size = _length; + dest_slice->data = _arena->Allocate(dest_slice->size); + memcpy(dest_slice->data, src_value.data(), src_len); + memset(dest_slice->data + src_len, 0, dest_slice->size - src_len); + } else if (fieldType == OLAP_FIELD_TYPE_VARCHAR) { + // Named std::string for the same lifetime reason as in the CHAR branch. + const std::string src_value = std::to_string(src); + int src_len = static_cast<int>(src_value.size()); + + auto* dest_slice = (Slice*)target; + dest_slice->size = src_len; + dest_slice->data = _arena->Allocate(src_len); + std::memcpy(dest_slice->data, src_value.data(), src_len); + } else { + *(int*)target = src; + } +} + +TEST_F(SegmentReaderWriterTest, TestStringDict) { + size_t num_rows_per_block = 10; + Arena _arena; + + std::shared_ptr tablet_schema(new TabletSchema()); + tablet_schema->_num_columns = 4; + tablet_schema->_num_key_columns = 3; + tablet_schema->_num_short_key_columns = 2; + tablet_schema->_num_rows_per_row_block = num_rows_per_block; + tablet_schema->_cols.push_back(create_char_key(1)); + tablet_schema->_cols.push_back(create_char_key(2)); + tablet_schema->_cols.push_back(create_varchar_key(3)); + tablet_schema->_cols.push_back(create_varchar_key(4)); + + // segment write + std::string dname = "./ut_dir/segment_test"; + FileUtils::create_dir(dname); + + SegmentWriterOptions opts; + opts.num_rows_per_block = num_rows_per_block; + + std::string fname = dname + "/string_case"; + + SegmentWriter writer(fname, 0, tablet_schema.get(), opts); + auto st = writer.init(10); + ASSERT_TRUE(st.ok()); + + RowCursor row; + auto olap_st = row.init(*tablet_schema); + 
ASSERT_EQ(OLAP_SUCCESS, olap_st); + + // 0, 1, 2, 3 + // 10, 11, 12, 13 + // 20, 21, 22, 23 + // convert int to string + for (int i = 0; i < 4096; ++i) { + for (int j = 0; j < 4; ++j) { + auto cell = row.cell(j); + cell.set_not_null(); + set_column_value_by_type(tablet_schema->_cols[j]._type, i * 10 + j, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[j]._length); + } + Status status = writer.append_row(row); + ASSERT_TRUE(status.ok()); + } + + uint64_t file_size = 0; + st = writer.finalize(&file_size); + ASSERT_TRUE(st.ok()); + + { + std::shared_ptr segment(new Segment(fname, 0, tablet_schema.get())); + st = segment->open(); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(4096, segment->num_rows()); + Schema schema(*tablet_schema); + + // scan all rows + { + StorageReadOptions read_opts; + std::unique_ptr iter = segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 1024); + + int left = 4096; + int rowid = 0; + + while (left > 0) { + int rows_read = left > 1024 ? 1024 : left; + block.clear(); + st = iter->next_batch(&block); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(rows_read, block.num_rows()); + left -= rows_read; + + for (int j = 0; j < block.schema()->column_ids().size(); ++j) { + auto cid = block.schema()->column_ids()[j]; + auto column_block = block.column_block(j); + for (int i = 0; i < rows_read; ++i) { + int rid = rowid + i; + ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i)); + const Slice* actual = reinterpret_cast(column_block.cell_ptr(i)); + + Slice expect; + set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, reinterpret_cast(&expect), &_arena, tablet_schema->_cols[j]._length); + ASSERT_EQ(expect.to_string(), actual->to_string()); + } + } + rowid += rows_read; + } + } + + // test seek, key + { + // lower bound + std::unique_ptr lower_bound(new RowCursor()); + lower_bound->init(*tablet_schema, 1); + { + auto cell = lower_bound->cell(0); + cell.set_not_null(); + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, 
40970, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length); + } + + StorageReadOptions read_opts; + read_opts.key_ranges.emplace_back(lower_bound.get(), false, nullptr, false); + std::unique_ptr iter = segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 100); + st = iter->next_batch(&block); + ASSERT_TRUE(st.is_end_of_file()); + ASSERT_EQ(0, block.num_rows()); + } + + // test seek, key (-2, -1) + { + // lower bound + std::unique_ptr lower_bound(new RowCursor()); + lower_bound->init(*tablet_schema, 1); + { + auto cell = lower_bound->cell(0); + cell.set_not_null(); + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -2, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length); + } + + std::unique_ptr upper_bound(new RowCursor()); + upper_bound->init(*tablet_schema, 1); + { + auto cell = upper_bound->cell(0); + cell.set_not_null(); + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -1, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length); + } + + StorageReadOptions read_opts; + read_opts.key_ranges.emplace_back(lower_bound.get(), false, upper_bound.get(), false); + std::unique_ptr iter = segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 100); + st = iter->next_batch(&block); + ASSERT_TRUE(st.is_end_of_file()); + ASSERT_EQ(0, block.num_rows()); + } + + // test char zone_map query hit;should read whole page + { + TCondition condition; + condition.__set_column_name("1"); + condition.__set_condition_op(">"); + std::vector vals = {"100"}; + condition.__set_condition_values(vals); + std::shared_ptr conditions(new Conditions()); + conditions->set_tablet_schema(tablet_schema.get()); + conditions->append_condition(condition); + + StorageReadOptions read_opts; + read_opts.conditions = conditions.get(); + + std::unique_ptr iter = segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 1024); + int left = 4 * 1024; + int rowid = 0; + + while (left > 0) { + int rows_read = 
left > 1024 ? 1024 : left; + block.clear(); + st = iter->next_batch(&block); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(rows_read, block.num_rows()); + left -= rows_read; + + for (int j = 0; j < block.schema()->column_ids().size(); ++j) { + auto cid = block.schema()->column_ids()[j]; + auto column_block = block.column_block(j); + for (int i = 0; i < rows_read; ++i) { + int rid = rowid + i; + ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i)); + + const Slice* actual = reinterpret_cast(column_block.cell_ptr(i)); + Slice expect; + set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, reinterpret_cast(&expect), &_arena, tablet_schema->_cols[j]._length); + ASSERT_EQ(expect.to_string(), actual->to_string()) << "rid:" << rid << ", i:" << i;; + } + } + rowid += rows_read; + } + ASSERT_EQ(4 * 1024, rowid); + st = iter->next_batch(&block); + ASSERT_TRUE(st.is_end_of_file()); + ASSERT_EQ(0, block.num_rows()); + } + + // test char zone_map query miss;col < -1 + { + TCondition condition; + condition.__set_column_name("1"); + condition.__set_condition_op("<"); + std::vector vals = {"-2"}; + condition.__set_condition_values(vals); + std::shared_ptr conditions(new Conditions()); + conditions->set_tablet_schema(tablet_schema.get()); + conditions->append_condition(condition); + + StorageReadOptions read_opts; + read_opts.conditions = conditions.get(); + + std::unique_ptr iter = segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 1024); + + st = iter->next_batch(&block); + ASSERT_TRUE(st.is_end_of_file()); + ASSERT_EQ(0, block.num_rows()); + } + + } + + FileUtils::remove_all(dname); +} + } } diff --git a/be/test/olap/tablet_schema_helper.h b/be/test/olap/tablet_schema_helper.h index 8e22b33a6b26bf..73ecdf323e3ff0 100644 --- a/be/test/olap/tablet_schema_helper.h +++ b/be/test/olap/tablet_schema_helper.h @@ -55,4 +55,28 @@ TabletColumn create_int_value( return column; } +TabletColumn create_char_key(int32_t id, bool is_nullable = true) { + 
TabletColumn column; + column._unique_id = id; + column._col_name = std::to_string(id); + column._type = OLAP_FIELD_TYPE_CHAR; + column._is_key = true; + column._is_nullable = is_nullable; + column._length = 8; + column._index_length = 1; + return column; +} + +TabletColumn create_varchar_key(int32_t id, bool is_nullable = true) { + TabletColumn column; + column._unique_id = id; + column._col_name = std::to_string(id); + column._type = OLAP_FIELD_TYPE_VARCHAR; + column._is_key = true; + column._is_nullable = is_nullable; + column._length = 4; + column._index_length = 4; + return column; +} + } diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 4a3f861a05cbbe..8abe0aede06d3e 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -95,7 +95,7 @@ message ColumnMetaPB { optional PagePointerPB zone_map_page = 8; // // dictionary page for DICT_ENCODING - // optional PagePointerPB dict_page = 2; + optional PagePointerPB dict_page = 9; // // bloom filter pages for bloom filter column // repeated PagePointerPB bloom_filter_pages = 3;