From 0835bc13d5abf2d444ba9cf00098657f528ac0b1 Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Mon, 9 Sep 2019 13:57:54 +0800 Subject: [PATCH 01/15] v2 segment support string encode(#1766) --- .../rowset/segment_v2/binary_dict_page.cpp | 23 +- .../olap/rowset/segment_v2/encoding_info.cpp | 17 ++ .../olap/rowset/segment_v2/segment_writer.cpp | 3 + be/src/util/arena.cpp | 2 + .../segment_v2/binary_dict_page_test.cpp | 32 +- .../segment_v2/binary_plain_page_test.cpp | 2 +- .../olap/rowset/segment_v2/segment_test.cpp | 278 +++++++++++++++++- be/test/olap/tablet_schema_helper.h | 24 ++ 8 files changed, 334 insertions(+), 47 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index f903547c1dd077..f7d275f94f0130 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -102,7 +102,16 @@ Slice BinaryDictPageBuilder::finish() { Slice data_slice = _data_page_builder->finish(); _buffer.append(data_slice.data, data_slice.size); encode_fixed32_le(&_buffer[0], _encoding_type); - return Slice(_buffer.data(), _buffer.size()); + + if (_encoding_type == DICT_ENCODING) { + size_t dict_offset = _buffer.size(); + Slice dictionary_page; + get_dictionary_page(&dictionary_page); + _buffer.append(dictionary_page.data, dictionary_page.size); + put_fixed32_le(&_buffer, dict_offset); + } + + return Slice(_buffer); } void BinaryDictPageBuilder::reset() { @@ -144,7 +153,7 @@ BinaryDictPageDecoder::BinaryDictPageDecoder(Slice data, const PageDecoderOption _data(data), _options(options), _data_page_decoder(nullptr), - _dict_decoder(options.dict_decoder), + _dict_decoder(nullptr), _parsed(false), _encoding_type(UNKNOWN_ENCODING) { } @@ -158,8 +167,15 @@ Status BinaryDictPageDecoder::init() { _encoding_type = static_cast(type); _data.remove_prefix(BINARY_DICT_PAGE_HEADER_SIZE); if (_encoding_type == DICT_ENCODING) { - DCHECK(_dict_decoder != 
nullptr) << "dict decoder pointer is nullptr"; + size_t dict_offset = decode_fixed32_le((const uint8_t *)&_data[_data.get_size() - sizeof(uint32_t)]) - BINARY_DICT_PAGE_HEADER_SIZE; + size_t dict_size = _data.get_size() - dict_offset - sizeof(uint32_t); + + + Slice dictSlice(&_data[dict_offset], dict_size); + _data.size = dict_offset; + _data_page_decoder.reset(new BitShufflePageDecoder(_data, _options)); + _dict_decoder.reset(new BinaryPlainPageDecoder(dictSlice)); } else if (_encoding_type == PLAIN_ENCODING) { DCHECK_EQ(_encoding_type, PLAIN_ENCODING); // use plain page decoder to decode data @@ -170,6 +186,7 @@ Status BinaryDictPageDecoder::init() { } RETURN_IF_ERROR(_data_page_decoder->init()); + RETURN_IF_ERROR(_dict_decoder->init()); _parsed = true; return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/encoding_info.cpp b/be/src/olap/rowset/segment_v2/encoding_info.cpp index 7486b8f56a58ef..276f28677d4f5f 100644 --- a/be/src/olap/rowset/segment_v2/encoding_info.cpp +++ b/be/src/olap/rowset/segment_v2/encoding_info.cpp @@ -19,6 +19,7 @@ #include "olap/olap_common.h" #include "olap/rowset/segment_v2/bitshuffle_page.h" +#include "olap/rowset/segment_v2/binary_dict_page.h" namespace doris { namespace segment_v2 { @@ -54,6 +55,18 @@ struct TypeEncodingTraits { } }; +template +struct TypeEncodingTraits { + static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) { + *builder = new BinaryDictPageBuilder(opts); + return Status::OK(); + } + static Status create_page_decoder(const Slice& data, const PageDecoderOptions& opts, PageDecoder** decoder) { + *decoder = new BinaryDictPageDecoder(data, opts); + return Status::OK(); + } +}; + template struct EncodingTraits : TypeEncodingTraits { static const FieldType type = Type; @@ -109,6 +122,10 @@ EncodingInfoResolver::EncodingInfoResolver() { _add_map(); _add_map(); _add_map(); + _add_map(); + _add_map(); + _add_map(); + _add_map(); } EncodingInfoResolver::~EncodingInfoResolver() 
{ diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 459e973ab669b0..542319ad37dd01 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -60,6 +60,9 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec) { DCHECK(type_info != nullptr); ColumnWriterOptions opts; + if (column.type() == OLAP_FIELD_TYPE_CHAR || column.type() == OLAP_FIELD_TYPE_VARCHAR) { + opts.encoding_type = DICT_ENCODING; + } opts.compression_type = segment_v2::CompressionTypePB::LZ4F; // now we create zone map for key columns if (column.is_key()) { diff --git a/be/src/util/arena.cpp b/be/src/util/arena.cpp index cf72db82b89d88..3f83a829d6378d 100644 --- a/be/src/util/arena.cpp +++ b/be/src/util/arena.cpp @@ -4,6 +4,7 @@ #include "util/arena.h" #include +#include "string.h" namespace doris { @@ -59,6 +60,7 @@ char* Arena::AllocateAligned(size_t bytes) { char* Arena::AllocateNewBlock(size_t block_bytes) { char* result = new char[block_bytes]; + memset(result, 0, block_bytes); blocks_.push_back(result); memory_usage_.store(MemoryUsage() + block_bytes + sizeof(char*), std::memory_order_relaxed); diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp index f918e70b22024e..b17140b7a2677a 100644 --- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp @@ -48,23 +48,10 @@ class BinaryDictPageTest : public testing::Test { ASSERT_EQ(slices.size(), page_builder.count()); ASSERT_FALSE(page_builder.is_page_full()); - // construct dict page - Slice dict_slice; - Status status = page_builder.get_dictionary_page(&dict_slice); - ASSERT_TRUE(status.ok()); - PageDecoderOptions dict_decoder_options; - std::shared_ptr dict_page_decoder( - new BinaryPlainPageDecoder(dict_slice, dict_decoder_options)); - status = dict_page_decoder->init(); - 
ASSERT_TRUE(status.ok()); - // because every slice is unique - ASSERT_EQ(slices.size(), dict_page_decoder->count()); - // decode PageDecoderOptions decoder_options; - decoder_options.dict_decoder = dict_page_decoder; BinaryDictPageDecoder page_decoder(s, decoder_options); - status = page_decoder.init(); + Status status = page_decoder.init(); ASSERT_TRUE(status.ok()); ASSERT_EQ(slices.size(), page_decoder.count()); @@ -131,13 +118,7 @@ class BinaryDictPageTest : public testing::Test { page_builder.reset(); page_start_ids.push_back(count); - Slice dict_slice; - Status status = page_builder.get_dictionary_page(&dict_slice); - size_t data_size = total_size; - total_size += dict_slice.size; - ASSERT_TRUE(status.ok()); - LOG(INFO) << "total size:" << total_size << ", data size:" << data_size - << ", dict size:" << dict_slice.size + LOG(INFO) << "total size:" << total_size << " result page size:" << results.size(); // validate @@ -145,18 +126,11 @@ class BinaryDictPageTest : public testing::Test { srand(time(nullptr)); for (int i = 0; i < 100; ++i) { int slice_index = random() % results.size(); - //int slice_index = 1; - PageDecoderOptions dict_decoder_options; - std::shared_ptr dict_page_decoder( - new BinaryPlainPageDecoder(dict_slice, dict_decoder_options)); - status = dict_page_decoder->init(); - ASSERT_TRUE(status.ok()); // decode PageDecoderOptions decoder_options; - decoder_options.dict_decoder = dict_page_decoder; BinaryDictPageDecoder page_decoder(results[slice_index], decoder_options); - status = page_decoder.init(); + Status status = page_decoder.init(); ASSERT_TRUE(status.ok()); //check values diff --git a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp index 0ae68fbee123fa..45f02422b87b8a 100644 --- a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp @@ -56,7 +56,7 @@ class BinaryPlainPageTest : public testing::Test { 
PageDecoderType page_decoder(s, decoder_options); Status status = page_decoder.init(); ASSERT_TRUE(status.ok()); - + //test1 size_t size = 3; diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp index 2636d258a47f2a..8a64aea5f0b620 100644 --- a/be/test/olap/rowset/segment_v2/segment_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_test.cpp @@ -42,6 +42,24 @@ class SegmentReaderWriterTest : public testing::Test { } }; +void set_column_value_by_type(FieldType fieldType, int src, char* target, size_t _length = 0) { + if (fieldType == OLAP_FIELD_TYPE_CHAR) { + char* src_value = &std::to_string(src)[0]; + int src_len = strlen(src_value); + + auto* dest_slice = (Slice*)target; + dest_slice->size = _length; + dest_slice->data = new char[dest_slice->size]; + memcpy(dest_slice->data, src_value, src_len); + memset(dest_slice->data + src_len, 0, dest_slice->size - src_len); + } else if (fieldType == OLAP_FIELD_TYPE_VARCHAR) { + Slice* slice = new Slice(*new string(&std::to_string(src)[0])); + std::memcpy(target, slice, sizeof(Slice)); + } else { + *(int*)target = src; + } +} + TEST_F(SegmentReaderWriterTest, normal) { size_t num_rows_per_block = 10; @@ -52,8 +70,8 @@ TEST_F(SegmentReaderWriterTest, normal) { tablet_schema->_num_short_key_columns = 2; tablet_schema->_num_rows_per_row_block = num_rows_per_block; tablet_schema->_cols.push_back(create_int_key(1)); - tablet_schema->_cols.push_back(create_int_key(2)); - tablet_schema->_cols.push_back(create_int_key(3)); + tablet_schema->_cols.push_back(create_char_key(2)); + tablet_schema->_cols.push_back(create_varchar_key(3)); tablet_schema->_cols.push_back(create_int_value(4)); // segment write @@ -79,7 +97,7 @@ TEST_F(SegmentReaderWriterTest, normal) { for (int j = 0; j < 4; ++j) { auto cell = row.cell(j); cell.set_not_null(); - *(int*)cell.mutable_cell_ptr() = i * 10 + j; + set_column_value_by_type(tablet_schema->_cols[j]._type, i * 10 + j, 
(char*)cell.mutable_cell_ptr(), tablet_schema->_cols[j]._length); } writer.append_row(row); } @@ -119,7 +137,14 @@ TEST_F(SegmentReaderWriterTest, normal) { for (int i = 0; i < rows_read; ++i) { int rid = rowid + i; ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i)); - ASSERT_EQ(rid * 10 + cid, *(int*)column_block.cell_ptr(i)); + if (j == 1 || j == 2) { + char* expect_value = new char[sizeof(Slice)]; + set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, expect_value, tablet_schema->_cols[j]._length); + const Slice* actual = reinterpret_cast(column_block.cell_ptr(i)); + ASSERT_EQ(((Slice*)expect_value)->to_string(), actual->to_string()); + } else { + ASSERT_EQ(rid * 10 + cid, *(int*)column_block.cell_ptr(i)); + } } } rowid += rows_read; @@ -137,8 +162,7 @@ TEST_F(SegmentReaderWriterTest, normal) { } { auto cell = lower_bound->cell(1); - cell.set_not_null(); - *(int*)cell.mutable_cell_ptr() = 100; + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, 100, (char*)cell.mutable_cell_ptr(), tablet_schema->_cols[1]._length); } // upper bound @@ -162,6 +186,14 @@ TEST_F(SegmentReaderWriterTest, normal) { for (int i = 0; i < 11; ++i) { ASSERT_EQ(100 + i * 10, *(int*)column_block.cell_ptr(i)); } + + auto column_char_block = block.column_block(1); + for (int i = 0; i < 11; ++i) { + const Slice* actual = reinterpret_cast(column_char_block.cell_ptr(i)); + char* except_value = new char[sizeof(Slice)]; + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, 100 + i * 10 + 1, except_value, 8); + ASSERT_EQ(((Slice*)except_value)->to_string(), actual->to_string()); + } } // test seek, key { @@ -221,11 +253,11 @@ TEST_F(SegmentReaderWriterTest, TestZoneMap) { std::shared_ptr tablet_schema(new TabletSchema()); tablet_schema->_num_columns = 4; tablet_schema->_num_key_columns = 3; - tablet_schema->_num_short_key_columns = 2; + tablet_schema->_num_short_key_columns = 1; tablet_schema->_num_rows_per_row_block = num_rows_per_block; - 
tablet_schema->_cols.push_back(create_int_key(1)); + tablet_schema->_cols.push_back(create_char_key(1)); tablet_schema->_cols.push_back(create_int_key(2)); - tablet_schema->_cols.push_back(create_int_key(3)); + tablet_schema->_cols.push_back(create_varchar_key(3)); tablet_schema->_cols.push_back(create_int_value(4)); // segment write @@ -253,7 +285,7 @@ TEST_F(SegmentReaderWriterTest, TestZoneMap) { for (int j = 0; j < 4; ++j) { auto cell = row.cell(j); cell.set_not_null(); - *(int*)cell.mutable_cell_ptr() = i * 10 + j; + set_column_value_by_type(tablet_schema->_cols[j]._type, i * 10 + j, (char*)cell.mutable_cell_ptr(), tablet_schema->_cols[j]._length); } writer.append_row(row); } @@ -305,7 +337,15 @@ TEST_F(SegmentReaderWriterTest, TestZoneMap) { for (int i = 0; i < rows_read; ++i) { int rid = rowid + i; ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i)); - ASSERT_EQ(rid * 10 + cid, *(int*)column_block.cell_ptr(i)) << "rid:" << rid << ", i:" << i; + + if (j == 0 || j == 2) { + char* expect_value = new char[sizeof(Slice)]; + set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, expect_value, tablet_schema->_cols[j]._length); + const Slice* actual = reinterpret_cast(column_block.cell_ptr(i)); + ASSERT_EQ(((Slice*)expect_value)->to_string(), actual->to_string()) << "rid:" << rid << ", i:" << i; + } else { + ASSERT_EQ(rid * 10 + cid, *(int*)column_block.cell_ptr(i)) << "rid:" << rid << ", i:" << i; + } } } rowid += rows_read; @@ -316,12 +356,222 @@ TEST_F(SegmentReaderWriterTest, TestZoneMap) { ASSERT_EQ(0, block.num_rows()); } } + + // reader with condition;test char + { + std::shared_ptr segment(new Segment(fname, 0, tablet_schema.get())); + st = segment->open(); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(64 * 1024, segment->num_rows()); + Schema schema(*tablet_schema); + // scan all rows + { + TCondition condition; + condition.__set_column_name("1"); + condition.__set_condition_op("<"); + + char* target = new char[sizeof(Slice)]; + 
set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, 100, target, tablet_schema->_cols[0]._length); + Slice* value = reinterpret_cast(target); + std::vector vals = {value->to_string()}; + condition.__set_condition_values(vals); + std::shared_ptr conditions(new Conditions()); + conditions->set_tablet_schema(tablet_schema.get()); + conditions->append_condition(condition); + + StorageReadOptions read_opts; + read_opts.conditions = conditions.get(); + + std::unique_ptr iter = segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 1024); + + // only first page will be read because of zone map + int left = 16 * 1024; + + int rowid = 0; + while (left > 0) { + int rows_read = left > 1024 ? 1024 : left; + block.clear(); + st = iter->next_batch(&block); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(rows_read, block.num_rows()); + left -= rows_read; + + for (int j = 0; j < block.schema()->column_ids().size(); ++j) { + auto cid = block.schema()->column_ids()[j]; + auto column_block = block.column_block(j); + for (int i = 0; i < rows_read; ++i) { + int rid = rowid + i; + ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i)); + if (j == 0 || j == 2) { + char* expect_value = new char[sizeof(Slice)]; + set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, expect_value, tablet_schema->_cols[j]._length); + const Slice* actual = reinterpret_cast(column_block.cell_ptr(i)); + ASSERT_EQ(((Slice*)expect_value)->to_string(), actual->to_string()) << "rid:" << rid << ", i:" << i; + } else { + ASSERT_EQ(rid * 10 + cid, *(int*)column_block.cell_ptr(i)) << "rid:" << rid << ", i:" << i; + } + } + } + rowid += rows_read; + } + ASSERT_EQ(16 * 1024, rowid); + st = iter->next_batch(&block); + ASSERT_TRUE(st.is_end_of_file()); + ASSERT_EQ(0, block.num_rows()); + } + FileUtils::remove_all(dname); -} + } +} // end of test zonemap -} -} +TEST_F(SegmentReaderWriterTest, TestStringDict) { + size_t num_rows_per_block = 10; + + std::shared_ptr tablet_schema(new TabletSchema()); + 
tablet_schema->_num_columns = 4; + tablet_schema->_num_key_columns = 3; + tablet_schema->_num_short_key_columns = 2; + tablet_schema->_num_rows_per_row_block = num_rows_per_block; + tablet_schema->_cols.push_back(create_char_key(1)); + tablet_schema->_cols.push_back(create_char_key(2)); + tablet_schema->_cols.push_back(create_char_key(3)); + tablet_schema->_cols.push_back(create_char_key(4)); + + // segment write + std::string dname = "./ut_dir/segment_test"; + FileUtils::create_dir(dname); + + SegmentWriterOptions opts; + opts.num_rows_per_block = num_rows_per_block; + + std::string fname = dname + "/string_case"; + + SegmentWriter writer2(fname, 0, tablet_schema.get(), opts); + auto st = writer2.init(10); + ASSERT_TRUE(st.ok()); + + RowCursor row; + auto olap_st = row.init(*tablet_schema); + ASSERT_EQ(OLAP_SUCCESS, olap_st); + + // 0, 1, 2, 3 + // 10, 11, 12, 13 + // 20, 21, 22, 23 + // + // 64k int will generate 4 pages + for (int i = 0; i < 4096; ++i) { + for (int j = 0; j < 4; ++j) { + auto cell = row.cell(j); + cell.set_not_null(); + Slice* slice = new Slice(*new string(&std::to_string(i * 10 + j)[0])); + std::memcpy(cell.mutable_cell_ptr(), slice, sizeof(Slice)); + } + Status status = writer2.append_row(row); + ASSERT_TRUE(status.ok()); + } + + uint32_t file_size = 0; + st = writer2.finalize(&file_size); + ASSERT_TRUE(st.ok()); + + { + std::shared_ptr segment(new Segment(fname, 0, tablet_schema.get())); + st = segment->open(); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(4096, segment->num_rows()); + Schema schema(*tablet_schema); + + // scan all rows + { + StorageReadOptions read_opts; + std::unique_ptr iter = segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 1024); + + int left = 4096; + int rowid = 0; + + while (left > 0) { + int rows_read = left > 1024 ? 
1024 : left; + block.clear(); + st = iter->next_batch(&block); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(rows_read, block.num_rows()); + left -= rows_read; + + for (int j = 0; j < block.schema()->column_ids().size(); ++j) { + auto cid = block.schema()->column_ids()[j]; + auto column_block = block.column_block(j); + for (int i = 0; i < rows_read; ++i) { + int rid = rowid + i; + ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i)); + const Slice* actual = reinterpret_cast(column_block.cell_ptr(i)); + ASSERT_EQ(&std::to_string(rid * 10 + cid)[0], actual->to_string()); + } + } + rowid += rows_read; + } + } + + // test seek, key + { + // lower bound + std::unique_ptr lower_bound(new RowCursor()); + lower_bound->init(*tablet_schema, 1); + { + auto cell = lower_bound->cell(0); + cell.set_not_null(); + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, 40970, (char*)cell.mutable_cell_ptr(), tablet_schema->_cols[0]._length); + } + + StorageReadOptions read_opts; + read_opts.key_ranges.emplace_back(lower_bound.get(), false, nullptr, false); + std::unique_ptr iter = segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 100); + st = iter->next_batch(&block); + ASSERT_TRUE(st.is_end_of_file()); + ASSERT_EQ(0, block.num_rows()); + } + + // test seek, key (-2, -1) + { + // lower bound + std::unique_ptr lower_bound(new RowCursor()); + lower_bound->init(*tablet_schema, 1); + { + auto cell = lower_bound->cell(0); + cell.set_not_null(); + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -2, (char*)cell.mutable_cell_ptr(), tablet_schema->_cols[0]._length); + } + + std::unique_ptr upper_bound(new RowCursor()); + upper_bound->init(*tablet_schema, 1); + { + auto cell = upper_bound->cell(0); + cell.set_not_null(); + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -1, (char*)cell.mutable_cell_ptr(), tablet_schema->_cols[0]._length); + } + + StorageReadOptions read_opts; + read_opts.key_ranges.emplace_back(lower_bound.get(), false, upper_bound.get(), false); + std::unique_ptr iter = 
segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 100); + st = iter->next_batch(&block); + ASSERT_TRUE(st.is_end_of_file()); + ASSERT_EQ(0, block.num_rows()); + } + + } + + FileUtils::remove_all(dname); +} // end of string dict +} // end of segment v2 namespace +} // end of doris namespace int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/be/test/olap/tablet_schema_helper.h b/be/test/olap/tablet_schema_helper.h index 01f43bc0eb7a6e..886bef52affd0b 100644 --- a/be/test/olap/tablet_schema_helper.h +++ b/be/test/olap/tablet_schema_helper.h @@ -51,4 +51,28 @@ TabletColumn create_int_value( return column; } +TabletColumn create_char_key(int32_t id, bool is_nullable = true) { + TabletColumn column; + column._unique_id = id; + column._col_name = std::to_string(id); + column._type = OLAP_FIELD_TYPE_CHAR; + column._is_key = true; + column._is_nullable = is_nullable; + column._length = 8; + column._index_length = 1; + return column; +} + +TabletColumn create_varchar_key(int32_t id, bool is_nullable = true) { + TabletColumn column; + column._unique_id = id; + column._col_name = std::to_string(id); + column._type = OLAP_FIELD_TYPE_VARCHAR; + column._is_key = true; + column._is_nullable = is_nullable; + column._length = 4; + column._index_length = 4; + return column; +} + } From 23807322fad258f370b660feef94f988f500c143 Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Tue, 17 Sep 2019 14:20:05 +0800 Subject: [PATCH 02/15] segment test using uint64 --- be/test/olap/rowset/segment_v2/segment_test.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp index 1ffd6f5740682a..c9d2df4863e4fe 100644 --- a/be/test/olap/rowset/segment_v2/segment_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_test.cpp @@ -516,8 +516,8 @@ TEST_F(SegmentReaderWriterTest, 
TestStringDict) { std::string fname = dname + "/string_case"; - SegmentWriter writer2(fname, 0, tablet_schema.get(), opts); - auto st = writer2.init(10); + SegmentWriter writer(fname, 0, tablet_schema.get(), opts); + auto st = writer.init(10); ASSERT_TRUE(st.ok()); RowCursor row; @@ -536,12 +536,12 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { Slice* slice = new Slice(*new string(&std::to_string(i * 10 + j)[0])); std::memcpy(cell.mutable_cell_ptr(), slice, sizeof(Slice)); } - Status status = writer2.append_row(row); + Status status = writer.append_row(row); ASSERT_TRUE(status.ok()); } - uint32_t file_size = 0; - st = writer2.finalize(&file_size); + uint64_t file_size = 0; + st = writer.finalize(&file_size); ASSERT_TRUE(st.ok()); { From 0a40fa8192ce6adf4d64e137d770fb1edf0c0cb4 Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Wed, 18 Sep 2019 10:25:37 +0800 Subject: [PATCH 03/15] update commit --- .../rowset/segment_v2/binary_dict_page.cpp | 29 ++++++----------- .../olap/rowset/segment_v2/binary_dict_page.h | 4 +++ .../olap/rowset/segment_v2/column_reader.cpp | 26 +++++++++++++++ be/src/olap/rowset/segment_v2/column_reader.h | 6 ++++ .../olap/rowset/segment_v2/column_writer.cpp | 11 +++++++ be/src/olap/rowset/segment_v2/column_writer.h | 1 + .../segment_v2/binary_dict_page_test.cpp | 32 +++++++++++++++++-- gensrc/proto/segment_v2.proto | 2 +- 8 files changed, 88 insertions(+), 23 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index 773ff77207ff33..9af22b32a26cb5 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -102,15 +102,6 @@ Slice BinaryDictPageBuilder::finish() { Slice data_slice = _data_page_builder->finish(); _buffer.append(data_slice.data, data_slice.size); encode_fixed32_le(&_buffer[0], _encoding_type); - - if (_encoding_type == DICT_ENCODING) { - size_t dict_offset = _buffer.size(); 
- Slice dictionary_page; - get_dictionary_page(&dictionary_page); - _buffer.append(dictionary_page.data, dictionary_page.size); - put_fixed32_le(&_buffer, dict_offset); - } - return Slice(_buffer); } @@ -157,7 +148,7 @@ BinaryDictPageDecoder::BinaryDictPageDecoder(Slice data, const PageDecoderOption _data(data), _options(options), _data_page_decoder(nullptr), - _dict_decoder(nullptr), + _dict_decoder(options.dict_decoder), _parsed(false), _encoding_type(UNKNOWN_ENCODING) { } @@ -171,15 +162,7 @@ Status BinaryDictPageDecoder::init() { _encoding_type = static_cast(type); _data.remove_prefix(BINARY_DICT_PAGE_HEADER_SIZE); if (_encoding_type == DICT_ENCODING) { - size_t dict_offset = decode_fixed32_le((const uint8_t *)&_data[_data.get_size() - sizeof(uint32_t)]) - BINARY_DICT_PAGE_HEADER_SIZE; - size_t dict_size = _data.get_size() - dict_offset - sizeof(uint32_t); - - - Slice dictSlice(&_data[dict_offset], dict_size); - _data.size = dict_offset; - _data_page_decoder.reset(new BitShufflePageDecoder(_data, _options)); - _dict_decoder.reset(new BinaryPlainPageDecoder(dictSlice)); } else if (_encoding_type == PLAIN_ENCODING) { DCHECK_EQ(_encoding_type, PLAIN_ENCODING); _data_page_decoder.reset(new BinaryPlainPageDecoder(_data, _options)); @@ -189,7 +172,6 @@ Status BinaryDictPageDecoder::init() { } RETURN_IF_ERROR(_data_page_decoder->init()); - RETURN_IF_ERROR(_dict_decoder->init()); _parsed = true; return Status::OK(); } @@ -198,12 +180,21 @@ Status BinaryDictPageDecoder::seek_to_position_in_page(size_t pos) { return _data_page_decoder->seek_to_position_in_page(pos); } +bool BinaryDictPageDecoder::is_dict_encoding() { + return _encoding_type == DICT_ENCODING; +} + +void BinaryDictPageDecoder::set_dict_decoder(std::shared_ptr dict_decoder){ + _dict_decoder = dict_decoder; +}; + Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) { if (_encoding_type == PLAIN_ENCODING) { return _data_page_decoder->next_batch(n, dst); } // dictionary encoding 
DCHECK(_parsed); + DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr"; if (PREDICT_FALSE(*n == 0)) { *n = 0; return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h index a37ec7ff584be3..24ea2ba84b297e 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.h +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h @@ -116,6 +116,10 @@ class BinaryDictPageDecoder : public PageDecoder { return _data_page_decoder->current_index(); } + bool is_dict_encoding(); + + void set_dict_decoder(std::shared_ptr dict_decoder); + private: Slice _data; PageDecoderOptions _options; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index b5d803870e51c8..a0c41f74a55277 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -32,6 +32,7 @@ #include "util/crc32c.h" #include "util/rle_encoding.h" // for RleDecoder #include "util/block_compression.h" +#include "binary_dict_page.h" namespace doris { namespace segment_v2 { @@ -166,6 +167,24 @@ void ColumnReader::get_row_ranges_by_zone_map(CondColumn* cond_column, RowRanges _calculate_row_ranges(page_indexes, row_ranges); } +Status ColumnReader::get_dict_page_decoder(BinaryDictPageDecoder* binaryDictPageDecoder) { + if (_column_dict_page_decoder == nullptr) { + PagePointer pp = _meta.dict_page(); + PageHandle ph; + RETURN_IF_ERROR(read_page(pp, &ph)); + + Slice dict_data = ph.data(); + + std::shared_ptr dict_page_decoder( + new BinaryPlainPageDecoder(dict_data)); + RETURN_IF_ERROR(dict_page_decoder->init()); + + _column_dict_page_decoder = dict_page_decoder; + } + binaryDictPageDecoder->set_dict_decoder(_column_dict_page_decoder); + return Status::OK(); +} + void ColumnReader::_get_filtered_pages(CondColumn* cond_column, std::vector* page_indexes) { FieldType type = _type_info->type(); const std::vector& zone_maps = 
_column_zone_map->get_column_zone_map(); @@ -411,6 +430,13 @@ Status FileColumnIterator::_read_page(const OrdinalPageIndexIterator& iter, Pars RETURN_IF_ERROR(_reader->encoding_info()->create_page_decoder(data, options, &page->data_decoder)); RETURN_IF_ERROR(page->data_decoder->init()); + if (_reader->encoding_info()->encoding() == DICT_ENCODING) { + BinaryDictPageDecoder* binary_dict_page_decoder = (BinaryDictPageDecoder*)page->data_decoder; + if (binary_dict_page_decoder->is_dict_encoding()) { + RETURN_IF_ERROR(_reader->get_dict_page_decoder(binary_dict_page_decoder)); + } + } + page->offset_in_page = 0; return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 7f50f870666a8f..43b53dc306ce40 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -28,6 +28,7 @@ #include "olap/rowset/segment_v2/ordinal_page_index.h" // for OrdinalPageIndexIterator #include "olap/rowset/segment_v2/column_zone_map.h" // for ColumnZoneMap #include "olap/rowset/segment_v2/row_ranges.h" // for RowRanges +#include "binary_dict_page.h" namespace doris { @@ -79,6 +80,8 @@ class ColumnReader { bool has_zone_map() { return _meta.has_zone_map_page(); } void get_row_ranges_by_zone_map(CondColumn* cond_column, RowRanges* row_ranges); + Status get_dict_page_decoder(BinaryDictPageDecoder* opts); + private: Status _init_ordinal_index(); @@ -103,6 +106,9 @@ class ColumnReader { // column zone map info std::unique_ptr _column_zone_map; + + // keep dict page + std::shared_ptr _column_dict_page_decoder; }; // Base iterator to read one column data diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index fe8dade03e8b36..4faf37270ae734 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -213,6 +213,14 @@ Status ColumnWriter::write_data() { 
RETURN_IF_ERROR(_write_data_page(page)); page = page->next; } + // write column dict + if (_encoding_info->encoding() == DICT_ENCODING) { + Slice dict_page; + _page_builder->get_dictionary_page(&dict_page); + std::vector origin_data; + origin_data.push_back(dict_page); + RETURN_IF_ERROR(_write_physical_page(&origin_data, &_dict_page_pp)); + } return Status::OK(); } @@ -240,6 +248,9 @@ void ColumnWriter::write_meta(ColumnMetaPB* meta) { if (_opts.need_zone_map) { _zone_map_pp.to_proto(meta->mutable_zone_map_page()); } + if (_encoding_info->encoding() == DICT_ENCODING) { + _dict_page_pp.to_proto(meta->mutable_dict_page()); + } } // write a page into file and update ordinal index diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 76cd5b4ab506e9..b881c13a3ecd00 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -157,6 +157,7 @@ class ColumnWriter { PagePointer _ordinal_index_pp; PagePointer _zone_map_pp; + PagePointer _dict_page_pp; uint64_t _written_size = 0; }; diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp index b17140b7a2677a..f918e70b22024e 100644 --- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp @@ -48,10 +48,23 @@ class BinaryDictPageTest : public testing::Test { ASSERT_EQ(slices.size(), page_builder.count()); ASSERT_FALSE(page_builder.is_page_full()); + // construct dict page + Slice dict_slice; + Status status = page_builder.get_dictionary_page(&dict_slice); + ASSERT_TRUE(status.ok()); + PageDecoderOptions dict_decoder_options; + std::shared_ptr dict_page_decoder( + new BinaryPlainPageDecoder(dict_slice, dict_decoder_options)); + status = dict_page_decoder->init(); + ASSERT_TRUE(status.ok()); + // because every slice is unique + ASSERT_EQ(slices.size(), dict_page_decoder->count()); 
+ // decode PageDecoderOptions decoder_options; + decoder_options.dict_decoder = dict_page_decoder; BinaryDictPageDecoder page_decoder(s, decoder_options); - Status status = page_decoder.init(); + status = page_decoder.init(); ASSERT_TRUE(status.ok()); ASSERT_EQ(slices.size(), page_decoder.count()); @@ -118,7 +131,13 @@ class BinaryDictPageTest : public testing::Test { page_builder.reset(); page_start_ids.push_back(count); - LOG(INFO) << "total size:" << total_size + Slice dict_slice; + Status status = page_builder.get_dictionary_page(&dict_slice); + size_t data_size = total_size; + total_size += dict_slice.size; + ASSERT_TRUE(status.ok()); + LOG(INFO) << "total size:" << total_size << ", data size:" << data_size + << ", dict size:" << dict_slice.size << " result page size:" << results.size(); // validate @@ -126,11 +145,18 @@ class BinaryDictPageTest : public testing::Test { srand(time(nullptr)); for (int i = 0; i < 100; ++i) { int slice_index = random() % results.size(); + //int slice_index = 1; + PageDecoderOptions dict_decoder_options; + std::shared_ptr dict_page_decoder( + new BinaryPlainPageDecoder(dict_slice, dict_decoder_options)); + status = dict_page_decoder->init(); + ASSERT_TRUE(status.ok()); // decode PageDecoderOptions decoder_options; + decoder_options.dict_decoder = dict_page_decoder; BinaryDictPageDecoder page_decoder(results[slice_index], decoder_options); - Status status = page_decoder.init(); + status = page_decoder.init(); ASSERT_TRUE(status.ok()); //check values diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 4a3f861a05cbbe..8abe0aede06d3e 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -95,7 +95,7 @@ message ColumnMetaPB { optional PagePointerPB zone_map_page = 8; // // dictionary page for DICT_ENCODING - // optional PagePointerPB dict_page = 2; + optional PagePointerPB dict_page = 9; // // bloom filter pages for bloom filter column // repeated PagePointerPB bloom_filter_pages 
= 3; From 61f286807351588e6470cf332930d694d31a2846 Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Thu, 19 Sep 2019 19:37:27 +0800 Subject: [PATCH 04/15] update zone map --- be/src/olap/rowset/segment_v2/column_zone_map.cpp | 2 ++ be/src/util/arena.cpp | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.cpp b/be/src/olap/rowset/segment_v2/column_zone_map.cpp index 909a9cfa4c2ba6..66aed086c8664b 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.cpp +++ b/be/src/olap/rowset/segment_v2/column_zone_map.cpp @@ -82,6 +82,8 @@ void ColumnZoneMapBuilder::_reset_zone_map() { Slice *min_slice = (Slice *)_zone_map.min_value; min_slice->data = _max_string_value; min_slice->size = OLAP_STRING_MAX_LENGTH; + Slice *max_slice = (Slice *)_zone_map.max_value; + max_slice->size = 0; _field->set_to_max(_zone_map.min_value); _field->set_to_min(_zone_map.max_value); _zone_map.has_null = false; diff --git a/be/src/util/arena.cpp b/be/src/util/arena.cpp index 3f83a829d6378d..cf72db82b89d88 100644 --- a/be/src/util/arena.cpp +++ b/be/src/util/arena.cpp @@ -4,7 +4,6 @@ #include "util/arena.h" #include -#include "string.h" namespace doris { @@ -60,7 +59,6 @@ char* Arena::AllocateAligned(size_t bytes) { char* Arena::AllocateNewBlock(size_t block_bytes) { char* result = new char[block_bytes]; - memset(result, 0, block_bytes); blocks_.push_back(result); memory_usage_.store(MemoryUsage() + block_bytes + sizeof(char*), std::memory_order_relaxed); From 12e31efc13c909dcf31c7feebf67b9486223d618 Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Mon, 23 Sep 2019 17:35:25 +0800 Subject: [PATCH 05/15] update commit 0924: keep dict page handle in FileColumnIterator; --- .../rowset/segment_v2/binary_dict_page.cpp | 5 ++-- .../olap/rowset/segment_v2/binary_dict_page.h | 4 +-- .../olap/rowset/segment_v2/column_reader.cpp | 30 ++++++++----------- be/src/olap/rowset/segment_v2/column_reader.h | 13 ++++---- 
be/src/olap/rowset/segment_v2/options.h | 1 - .../segment_v2/binary_dict_page_test.cpp | 11 ++++--- 6 files changed, 30 insertions(+), 34 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index 9af22b32a26cb5..bb4b69644d8bc9 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -148,7 +148,6 @@ BinaryDictPageDecoder::BinaryDictPageDecoder(Slice data, const PageDecoderOption _data(data), _options(options), _data_page_decoder(nullptr), - _dict_decoder(options.dict_decoder), _parsed(false), _encoding_type(UNKNOWN_ENCODING) { } @@ -184,8 +183,8 @@ bool BinaryDictPageDecoder::is_dict_encoding() { return _encoding_type == DICT_ENCODING; } -void BinaryDictPageDecoder::set_dict_decoder(std::shared_ptr dict_decoder){ - _dict_decoder = dict_decoder; +void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder){ + _dict_decoder = (BinaryPlainPageDecoder*)dict_decoder; }; Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) { diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h index 24ea2ba84b297e..e7d4ff78742abb 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.h +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h @@ -118,13 +118,13 @@ class BinaryDictPageDecoder : public PageDecoder { bool is_dict_encoding(); - void set_dict_decoder(std::shared_ptr dict_decoder); + void set_dict_decoder(PageDecoder* dict_decoder); private: Slice _data; PageDecoderOptions _options; std::unique_ptr _data_page_decoder; - std::shared_ptr _dict_decoder; + BinaryPlainPageDecoder* _dict_decoder; bool _parsed; EncodingTypePB _encoding_type; faststring _code_buf; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index a0c41f74a55277..7da981f9646914 100644 --- 
a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -167,22 +167,8 @@ void ColumnReader::get_row_ranges_by_zone_map(CondColumn* cond_column, RowRanges _calculate_row_ranges(page_indexes, row_ranges); } -Status ColumnReader::get_dict_page_decoder(BinaryDictPageDecoder* binaryDictPageDecoder) { - if (_column_dict_page_decoder == nullptr) { - PagePointer pp = _meta.dict_page(); - PageHandle ph; - RETURN_IF_ERROR(read_page(pp, &ph)); - - Slice dict_data = ph.data(); - - std::shared_ptr dict_page_decoder( - new BinaryPlainPageDecoder(dict_data)); - RETURN_IF_ERROR(dict_page_decoder->init()); - - _column_dict_page_decoder = dict_page_decoder; - } - binaryDictPageDecoder->set_dict_decoder(_column_dict_page_decoder); - return Status::OK(); +PagePointer ColumnReader::get_dict_page_pointer() { + return _meta.dict_page(); } void ColumnReader::_get_filtered_pages(CondColumn* cond_column, std::vector* page_indexes) { @@ -433,7 +419,17 @@ Status FileColumnIterator::_read_page(const OrdinalPageIndexIterator& iter, Pars if (_reader->encoding_info()->encoding() == DICT_ENCODING) { BinaryDictPageDecoder* binary_dict_page_decoder = (BinaryDictPageDecoder*)page->data_decoder; if (binary_dict_page_decoder->is_dict_encoding()) { - RETURN_IF_ERROR(_reader->get_dict_page_decoder(binary_dict_page_decoder)); + if (_dict_decoder == nullptr) { + PagePointer pp = _reader->get_dict_page_pointer(); + RETURN_IF_ERROR(_reader->read_page(pp, &_dict_page_handle)); + + BinaryPlainPageDecoder* dict_decoder = new BinaryPlainPageDecoder(_dict_page_handle.data()); + RETURN_IF_ERROR(dict_decoder->init()); + + std::unique_ptr dict_page_ptr(dict_decoder); + _dict_decoder = std::move(dict_page_ptr); + } + binary_dict_page_decoder->set_dict_decoder(_dict_decoder.get()); } } diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 43b53dc306ce40..37290a67f43358 100644 --- 
a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -28,7 +28,7 @@ #include "olap/rowset/segment_v2/ordinal_page_index.h" // for OrdinalPageIndexIterator #include "olap/rowset/segment_v2/column_zone_map.h" // for ColumnZoneMap #include "olap/rowset/segment_v2/row_ranges.h" // for RowRanges -#include "binary_dict_page.h" +#include "page_handle.h" namespace doris { @@ -80,7 +80,7 @@ class ColumnReader { bool has_zone_map() { return _meta.has_zone_map_page(); } void get_row_ranges_by_zone_map(CondColumn* cond_column, RowRanges* row_ranges); - Status get_dict_page_decoder(BinaryDictPageDecoder* opts); + PagePointer get_dict_page_pointer(); private: Status _init_ordinal_index(); @@ -106,9 +106,6 @@ class ColumnReader { // column zone map info std::unique_ptr _column_zone_map; - - // keep dict page - std::shared_ptr _column_dict_page_decoder; }; // Base iterator to read one column data @@ -186,6 +183,12 @@ class FileColumnIterator : public ColumnIterator { // 3. When _page is null, it means that this reader can not be read. std::unique_ptr _page; + // keep dict page decoder + std::unique_ptr _dict_decoder; + + // keep dict page handle to avoid released + PageHandle _dict_page_handle; + // page iterator used to get next page when current page is finished. 
// This value will be reset when a new seek is issued OrdinalPageIndexIterator _page_iter; diff --git a/be/src/olap/rowset/segment_v2/options.h b/be/src/olap/rowset/segment_v2/options.h index aea886b95dfd4a..9ee74ae2bc17f7 100644 --- a/be/src/olap/rowset/segment_v2/options.h +++ b/be/src/olap/rowset/segment_v2/options.h @@ -33,7 +33,6 @@ struct PageBuilderOptions { }; struct PageDecoderOptions { - std::shared_ptr dict_decoder = nullptr; }; } // namespace segment_v2 diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp index f918e70b22024e..fdb185d2188fbe 100644 --- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp @@ -53,8 +53,7 @@ class BinaryDictPageTest : public testing::Test { Status status = page_builder.get_dictionary_page(&dict_slice); ASSERT_TRUE(status.ok()); PageDecoderOptions dict_decoder_options; - std::shared_ptr dict_page_decoder( - new BinaryPlainPageDecoder(dict_slice, dict_decoder_options)); + BinaryPlainPageDecoder* dict_page_decoder = new BinaryPlainPageDecoder(dict_slice, dict_decoder_options); status = dict_page_decoder->init(); ASSERT_TRUE(status.ok()); // because every slice is unique @@ -62,8 +61,9 @@ class BinaryDictPageTest : public testing::Test { // decode PageDecoderOptions decoder_options; - decoder_options.dict_decoder = dict_page_decoder; BinaryDictPageDecoder page_decoder(s, decoder_options); + page_decoder.set_dict_decoder(dict_page_decoder); + status = page_decoder.init(); ASSERT_TRUE(status.ok()); ASSERT_EQ(slices.size(), page_decoder.count()); @@ -147,16 +147,15 @@ class BinaryDictPageTest : public testing::Test { int slice_index = random() % results.size(); //int slice_index = 1; PageDecoderOptions dict_decoder_options; - std::shared_ptr dict_page_decoder( - new BinaryPlainPageDecoder(dict_slice, dict_decoder_options)); + BinaryPlainPageDecoder* dict_page_decoder = new 
BinaryPlainPageDecoder(dict_slice, dict_decoder_options); status = dict_page_decoder->init(); ASSERT_TRUE(status.ok()); // decode PageDecoderOptions decoder_options; - decoder_options.dict_decoder = dict_page_decoder; BinaryDictPageDecoder page_decoder(results[slice_index], decoder_options); status = page_decoder.init(); + page_decoder.set_dict_decoder(dict_page_decoder); ASSERT_TRUE(status.ok()); //check values From 52195ad314f052f25eae1820dae09d1ee85eb65c Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Wed, 25 Sep 2019 10:01:43 +0800 Subject: [PATCH 06/15] update commit 0925 10 --- be/src/olap/rowset/segment_v2/binary_dict_page.h | 2 +- be/src/olap/rowset/segment_v2/column_reader.cpp | 9 +++------ be/src/olap/rowset/segment_v2/segment_writer.cpp | 3 --- .../rowset/segment_v2/binary_dict_page_test.cpp | 10 ++++++---- be/test/olap/rowset/segment_v2/segment_test.cpp | 15 ++++++++++----- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h index e7d4ff78742abb..956a1a18b44421 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.h +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h @@ -124,7 +124,7 @@ class BinaryDictPageDecoder : public PageDecoder { Slice _data; PageDecoderOptions _options; std::unique_ptr _data_page_decoder; - BinaryPlainPageDecoder* _dict_decoder; + const BinaryPlainPageDecoder* _dict_decoder; bool _parsed; EncodingTypePB _encoding_type; faststring _code_buf; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index a3d2c595cdfb0a..4e886f273be1a7 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -32,7 +32,7 @@ #include "util/crc32c.h" #include "util/rle_encoding.h" // for RleDecoder #include "util/block_compression.h" -#include "binary_dict_page.h" +#include 
"olap/rowset/segment_v2/binary_dict_page.h" // for BinaryDictPageDecoder namespace doris { namespace segment_v2 { @@ -438,11 +438,8 @@ Status FileColumnIterator::_read_page(const OrdinalPageIndexIterator& iter, Pars PagePointer pp = _reader->get_dict_page_pointer(); RETURN_IF_ERROR(_reader->read_page(pp, &_dict_page_handle)); - BinaryPlainPageDecoder* dict_decoder = new BinaryPlainPageDecoder(_dict_page_handle.data()); - RETURN_IF_ERROR(dict_decoder->init()); - - std::unique_ptr dict_page_ptr(dict_decoder); - _dict_decoder = std::move(dict_page_ptr); + _dict_decoder.reset(new BinaryPlainPageDecoder(_dict_page_handle.data())); + RETURN_IF_ERROR(_dict_decoder->init()); } binary_dict_page_decoder->set_dict_decoder(_dict_decoder.get()); } diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 54a192e23a6df7..c4afd2a8527052 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -60,9 +60,6 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec) { DCHECK(type_info != nullptr); ColumnWriterOptions opts; - if (column.type() == OLAP_FIELD_TYPE_CHAR || column.type() == OLAP_FIELD_TYPE_VARCHAR) { - opts.encoding_type = DICT_ENCODING; - } opts.compression_type = segment_v2::CompressionTypePB::LZ4F; // now we create zone map for key columns if (column.is_key()) { diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp index fdb185d2188fbe..7c26f3587bff6e 100644 --- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp @@ -53,7 +53,8 @@ class BinaryDictPageTest : public testing::Test { Status status = page_builder.get_dictionary_page(&dict_slice); ASSERT_TRUE(status.ok()); PageDecoderOptions dict_decoder_options; - BinaryPlainPageDecoder* dict_page_decoder = new BinaryPlainPageDecoder(dict_slice, 
dict_decoder_options); + std::unique_ptr dict_page_decoder( + new BinaryPlainPageDecoder(dict_slice, dict_decoder_options)); status = dict_page_decoder->init(); ASSERT_TRUE(status.ok()); // because every slice is unique @@ -62,7 +63,7 @@ class BinaryDictPageTest : public testing::Test { // decode PageDecoderOptions decoder_options; BinaryDictPageDecoder page_decoder(s, decoder_options); - page_decoder.set_dict_decoder(dict_page_decoder); + page_decoder.set_dict_decoder(dict_page_decoder.get()); status = page_decoder.init(); ASSERT_TRUE(status.ok()); @@ -147,7 +148,8 @@ class BinaryDictPageTest : public testing::Test { int slice_index = random() % results.size(); //int slice_index = 1; PageDecoderOptions dict_decoder_options; - BinaryPlainPageDecoder* dict_page_decoder = new BinaryPlainPageDecoder(dict_slice, dict_decoder_options); + std::unique_ptr dict_page_decoder( + new BinaryPlainPageDecoder(dict_slice, dict_decoder_options)); status = dict_page_decoder->init(); ASSERT_TRUE(status.ok()); @@ -155,7 +157,7 @@ class BinaryDictPageTest : public testing::Test { PageDecoderOptions decoder_options; BinaryDictPageDecoder page_decoder(results[slice_index], decoder_options); status = page_decoder.init(); - page_decoder.set_dict_decoder(dict_page_decoder); + page_decoder.set_dict_decoder(dict_page_decoder.get()); ASSERT_TRUE(status.ok()); //check values diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp index bd81c2ba74c282..766488a6cac7b2 100644 --- a/be/test/olap/rowset/segment_v2/segment_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_test.cpp @@ -619,6 +619,7 @@ void set_column_value_by_type(FieldType fieldType, int src, char* target, size_t } else if (fieldType == OLAP_FIELD_TYPE_VARCHAR) { Slice* slice = new Slice(*new string(&std::to_string(src)[0])); std::memcpy(target, slice, sizeof(Slice)); + delete slice; } else { *(int*)target = src; } @@ -634,8 +635,8 @@ TEST_F(SegmentReaderWriterTest, 
TestStringDict) { tablet_schema->_num_rows_per_row_block = num_rows_per_block; tablet_schema->_cols.push_back(create_char_key(1)); tablet_schema->_cols.push_back(create_char_key(2)); - tablet_schema->_cols.push_back(create_char_key(3)); - tablet_schema->_cols.push_back(create_char_key(4)); + tablet_schema->_cols.push_back(create_varchar_key(3)); + tablet_schema->_cols.push_back(create_varchar_key(4)); // segment write std::string dname = "./ut_dir/segment_test"; @@ -663,8 +664,7 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { for (int j = 0; j < 4; ++j) { auto cell = row.cell(j); cell.set_not_null(); - Slice* slice = new Slice(*new string(&std::to_string(i * 10 + j)[0])); - std::memcpy(cell.mutable_cell_ptr(), slice, sizeof(Slice)); + set_column_value_by_type(tablet_schema->_cols[j]._type, i * 10 + j, (char*)cell.mutable_cell_ptr(), tablet_schema->_cols[j]._length); } Status status = writer.append_row(row); ASSERT_TRUE(status.ok()); @@ -706,7 +706,12 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { int rid = rowid + i; ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i)); const Slice* actual = reinterpret_cast(column_block.cell_ptr(i)); - ASSERT_EQ(&std::to_string(rid * 10 + cid)[0], actual->to_string()); + + char* expect = new char[sizeof(Slice)]; + set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, expect, tablet_schema->_cols[j]._length); + Slice* expect_ = reinterpret_cast(expect); + ASSERT_EQ(expect_->to_string(), actual->to_string()); + delete expect; } } rowid += rows_read; From 2d519fc79ab5b705dc0883acbb93c054f75f6df2 Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Wed, 25 Sep 2019 19:49:43 +0800 Subject: [PATCH 07/15] update commit 0925 8 --- .../olap/rowset/segment_v2/column_reader.cpp | 2 +- be/src/olap/rowset/segment_v2/column_reader.h | 4 +-- .../olap/rowset/segment_v2/segment_test.cpp | 34 ++++++++++--------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git 
a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 4e886f273be1a7..b440ffe90d4c1b 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -169,7 +169,7 @@ void ColumnReader::get_row_ranges_by_zone_map(CondColumn* cond_column, _calculate_row_ranges(page_indexes, row_ranges); } -PagePointer ColumnReader::get_dict_page_pointer() { +PagePointer ColumnReader::get_dict_page_pointer() const { return _meta.dict_page(); } diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 3bb0fe75c55f81..af34db2903beef 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -29,7 +29,7 @@ #include "olap/rowset/segment_v2/ordinal_page_index.h" // for OrdinalPageIndexIterator #include "olap/rowset/segment_v2/column_zone_map.h" // for ColumnZoneMap #include "olap/rowset/segment_v2/row_ranges.h" // for RowRanges -#include "page_handle.h" +#include "olap/rowset/segment_v2/page_handle.h" // for PageHandle namespace doris { @@ -86,7 +86,7 @@ class ColumnReader { void get_row_ranges_by_zone_map(CondColumn* cond_column, const std::vector& delete_conditions, RowRanges* row_ranges); - PagePointer get_dict_page_pointer(); + PagePointer get_dict_page_pointer() const; private: Status _init_ordinal_index(); diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp index 766488a6cac7b2..890db937aa95f3 100644 --- a/be/test/olap/rowset/segment_v2/segment_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_test.cpp @@ -606,20 +606,24 @@ TEST_F(SegmentReaderWriterTest, TestDefaultValueColumn) { } } -void set_column_value_by_type(FieldType fieldType, int src, char* target, size_t _length = 0) { +void set_column_value_by_type(FieldType fieldType, int src, char* target, Arena* _arena, size_t _length = 0) { if (fieldType == 
OLAP_FIELD_TYPE_CHAR) { char* src_value = &std::to_string(src)[0]; int src_len = strlen(src_value); auto* dest_slice = (Slice*)target; dest_slice->size = _length; - dest_slice->data = new char[dest_slice->size]; + dest_slice->data = _arena->Allocate(dest_slice->size); memcpy(dest_slice->data, src_value, src_len); memset(dest_slice->data + src_len, 0, dest_slice->size - src_len); } else if (fieldType == OLAP_FIELD_TYPE_VARCHAR) { - Slice* slice = new Slice(*new string(&std::to_string(src)[0])); - std::memcpy(target, slice, sizeof(Slice)); - delete slice; + char* src_value = &std::to_string(src)[0]; + int src_len = strlen(src_value); + + auto* dest_slice = (Slice*)target; + dest_slice->size = src_len; + dest_slice->data = _arena->Allocate(src_len); + std::memcpy(dest_slice->data, src_value, src_len); } else { *(int*)target = src; } @@ -627,6 +631,7 @@ void set_column_value_by_type(FieldType fieldType, int src, char* target, size_t TEST_F(SegmentReaderWriterTest, TestStringDict) { size_t num_rows_per_block = 10; + Arena _arena; std::shared_ptr tablet_schema(new TabletSchema()); tablet_schema->_num_columns = 4; @@ -658,13 +663,12 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { // 0, 1, 2, 3 // 10, 11, 12, 13 // 20, 21, 22, 23 - // - // 64k int will generate 4 pages + // convert int to string for (int i = 0; i < 4096; ++i) { for (int j = 0; j < 4; ++j) { auto cell = row.cell(j); cell.set_not_null(); - set_column_value_by_type(tablet_schema->_cols[j]._type, i * 10 + j, (char*)cell.mutable_cell_ptr(), tablet_schema->_cols[j]._length); + set_column_value_by_type(tablet_schema->_cols[j]._type, i * 10 + j, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[j]._length); } Status status = writer.append_row(row); ASSERT_TRUE(status.ok()); @@ -707,11 +711,9 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i)); const Slice* actual = reinterpret_cast(column_block.cell_ptr(i)); - char* expect = new 
char[sizeof(Slice)]; - set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, expect, tablet_schema->_cols[j]._length); - Slice* expect_ = reinterpret_cast(expect); - ASSERT_EQ(expect_->to_string(), actual->to_string()); - delete expect; + Slice expect; + set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, reinterpret_cast(&expect), &_arena, tablet_schema->_cols[j]._length); + ASSERT_EQ(expect.to_string(), actual->to_string()); } } rowid += rows_read; @@ -726,7 +728,7 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { { auto cell = lower_bound->cell(0); cell.set_not_null(); - set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, 40970, (char*)cell.mutable_cell_ptr(), tablet_schema->_cols[0]._length); + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, 40970, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length); } StorageReadOptions read_opts; @@ -747,7 +749,7 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { { auto cell = lower_bound->cell(0); cell.set_not_null(); - set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -2, (char*)cell.mutable_cell_ptr(), tablet_schema->_cols[0]._length); + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -2, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length); } std::unique_ptr upper_bound(new RowCursor()); @@ -755,7 +757,7 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { { auto cell = upper_bound->cell(0); cell.set_not_null(); - set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -1, (char*)cell.mutable_cell_ptr(), tablet_schema->_cols[0]._length); + set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -1, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length); } StorageReadOptions read_opts; From 349b251c5f9d098af9b01c28ffb601b1a331e02f Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Thu, 26 Sep 2019 13:27:34 +0800 Subject: [PATCH 08/15] update commit 0926:17 --- be/src/olap/olap_define.h | 5 +- .../rowset/segment_v2/binary_dict_page.cpp 
| 2 +- .../olap/rowset/segment_v2/binary_dict_page.h | 4 +- .../rowset/segment_v2/column_zone_map.cpp | 15 ++-- .../olap/rowset/segment_v2/column_zone_map.h | 1 - be/src/olap/types.h | 2 +- .../olap/rowset/segment_v2/segment_test.cpp | 72 +++++++++++++++++++ 7 files changed, 89 insertions(+), 12 deletions(-) diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h index 3735f1e2c8f104..685a9ef1b8ef1c 100644 --- a/be/src/olap/olap_define.h +++ b/be/src/olap/olap_define.h @@ -50,9 +50,12 @@ static const uint64_t OLAP_FIX_HEADER_MAGIC_NUMBER = 0; // 执行be/ce时默认的候选集大小 static constexpr uint32_t OLAP_COMPACTION_DEFAULT_CANDIDATE_SIZE = 10; -// the max length supported for string type +// the max length supported for varchar type static const uint16_t OLAP_STRING_MAX_LENGTH = 65535; +//the max length supported for char type +static const uint16_t OLAP_CHAR_MAX_LENGTH = 255; + static const int32_t PREFERRED_SNAPSHOT_VERSION = 3; // the max bytes for stored string length diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index 2283396cd7dc43..6b6fbca4af5086 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -179,7 +179,7 @@ Status BinaryDictPageDecoder::seek_to_position_in_page(size_t pos) { return _data_page_decoder->seek_to_position_in_page(pos); } -bool BinaryDictPageDecoder::is_dict_encoding() { +bool BinaryDictPageDecoder::is_dict_encoding() const { return _encoding_type == DICT_ENCODING; } diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h index 956a1a18b44421..cd72b1c2b41849 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.h +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h @@ -116,7 +116,7 @@ class BinaryDictPageDecoder : public PageDecoder { return _data_page_decoder->current_index(); } - bool is_dict_encoding(); + bool is_dict_encoding() 
const; void set_dict_decoder(PageDecoder* dict_decoder); @@ -124,7 +124,7 @@ class BinaryDictPageDecoder : public PageDecoder { Slice _data; PageDecoderOptions _options; std::unique_ptr _data_page_decoder; - const BinaryPlainPageDecoder* _dict_decoder; + const BinaryPlainPageDecoder* _dict_decoder = nullptr; bool _parsed; EncodingTypePB _encoding_type; faststring _code_buf; diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.cpp b/be/src/olap/rowset/segment_v2/column_zone_map.cpp index 66aed086c8664b..8e61e52f7c9f76 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.cpp +++ b/be/src/olap/rowset/segment_v2/column_zone_map.cpp @@ -28,7 +28,6 @@ ColumnZoneMapBuilder::ColumnZoneMapBuilder(const TypeInfo* type_info) : _type_in options.data_page_size = 0; _page_builder.reset(new BinaryPlainPageBuilder(options)); _field.reset(FieldFactory::create_by_type(_type_info->type())); - _max_string_value = _arena.Allocate(OLAP_STRING_MAX_LENGTH); _zone_map.min_value = _arena.Allocate(_type_info->size()); _zone_map.max_value = _arena.Allocate(_type_info->size()); _reset_zone_map(); @@ -79,11 +78,15 @@ Status ColumnZoneMapBuilder::flush() { void ColumnZoneMapBuilder::_reset_zone_map() { // we should allocate max varchar length and set to max for min value - Slice *min_slice = (Slice *)_zone_map.min_value; - min_slice->data = _max_string_value; - min_slice->size = OLAP_STRING_MAX_LENGTH; - Slice *max_slice = (Slice *)_zone_map.max_value; - max_slice->size = 0; + if (_type_info->type() == OLAP_FIELD_TYPE_VARCHAR) { + Slice *min_slice = (Slice *)_zone_map.min_value; + min_slice->data = _arena.Allocate(OLAP_STRING_MAX_LENGTH);; + min_slice->size = OLAP_STRING_MAX_LENGTH; + } else if (_type_info->type() == OLAP_FIELD_TYPE_CHAR) { + Slice *min_value = (Slice *)_zone_map.min_value; + min_value->data = _arena.Allocate(OLAP_CHAR_MAX_LENGTH);; + min_value->size = OLAP_CHAR_MAX_LENGTH; + } _field->set_to_max(_zone_map.min_value); _field->set_to_min(_zone_map.max_value); 
_zone_map.has_null = false; diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.h b/be/src/olap/rowset/segment_v2/column_zone_map.h index 1fb01bc2d80955..3e2b7ac78f6a12 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.h +++ b/be/src/olap/rowset/segment_v2/column_zone_map.h @@ -73,7 +73,6 @@ class ColumnZoneMapBuilder { std::unique_ptr _field; // memory will be managed by arena ZoneMap _zone_map; - char* _max_string_value; Arena _arena; }; diff --git a/be/src/olap/types.h b/be/src/olap/types.h index 2b63035c512b01..ba84c588a31209 100644 --- a/be/src/olap/types.h +++ b/be/src/olap/types.h @@ -562,7 +562,7 @@ struct FieldTypeTraits : public BaseFieldtypeTraits(buf); - memset(slice->data, 0, slice->size); + slice->size = 0; } static uint32_t hash_code(const void* data, uint32_t seed) { auto slice = reinterpret_cast(data); diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp index 890db937aa95f3..afaeb97f25ae67 100644 --- a/be/test/olap/rowset/segment_v2/segment_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_test.cpp @@ -770,6 +770,78 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { ASSERT_EQ(0, block.num_rows()); } + // test char zone_map query hit;should read whole page + { + TCondition condition; + condition.__set_column_name("1"); + condition.__set_condition_op(">"); + std::vector vals = {"100"}; + condition.__set_condition_values(vals); + std::shared_ptr conditions(new Conditions()); + conditions->set_tablet_schema(tablet_schema.get()); + conditions->append_condition(condition); + + StorageReadOptions read_opts; + read_opts.conditions = conditions.get(); + + std::unique_ptr iter = segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 1024); + int left = 4 * 1024; + int rowid = 0; + + while (left > 0) { + int rows_read = left > 1024 ? 
1024 : left; + block.clear(); + st = iter->next_batch(&block); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(rows_read, block.num_rows()); + left -= rows_read; + + for (int j = 0; j < block.schema()->column_ids().size(); ++j) { + auto cid = block.schema()->column_ids()[j]; + auto column_block = block.column_block(j); + for (int i = 0; i < rows_read; ++i) { + int rid = rowid + i; + ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i)); + + const Slice* actual = reinterpret_cast(column_block.cell_ptr(i)); + Slice expect; + set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, reinterpret_cast(&expect), &_arena, tablet_schema->_cols[j]._length); + ASSERT_EQ(expect.to_string(), actual->to_string()) << "rid:" << rid << ", i:" << i;; + } + } + rowid += rows_read; + } + ASSERT_EQ(4 * 1024, rowid); + st = iter->next_batch(&block); + ASSERT_TRUE(st.is_end_of_file()); + ASSERT_EQ(0, block.num_rows()); + } + + // test char zone_map query miss;col < -1 + { + TCondition condition; + condition.__set_column_name("1"); + condition.__set_condition_op("<"); + std::vector vals = {"-1"}; + condition.__set_condition_values(vals); + std::shared_ptr conditions(new Conditions()); + conditions->set_tablet_schema(tablet_schema.get()); + conditions->append_condition(condition); + + StorageReadOptions read_opts; + read_opts.conditions = conditions.get(); + + std::unique_ptr iter = segment->new_iterator(schema, read_opts); + + RowBlockV2 block(schema, 1024); + + st = iter->next_batch(&block); + ASSERT_TRUE(st.is_end_of_file()); + ASSERT_EQ(0, block.num_rows()); + } + } FileUtils::remove_all(dname); From ec186d8ebf920c1e156d4ba8701824414b89934c Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Fri, 27 Sep 2019 15:22:38 +0800 Subject: [PATCH 09/15] re fresh commit From 118bce263679c7912006990c9388aec6bb5a7726 Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Fri, 27 Sep 2019 16:10:21 +0800 Subject: [PATCH 10/15] reuse column zone map char array --- 
.../olap/rowset/segment_v2/column_zone_map.cpp | 16 +++++++++++++--- be/src/olap/rowset/segment_v2/column_zone_map.h | 2 ++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.cpp b/be/src/olap/rowset/segment_v2/column_zone_map.cpp index 8e61e52f7c9f76..93045d427c3779 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.cpp +++ b/be/src/olap/rowset/segment_v2/column_zone_map.cpp @@ -30,6 +30,15 @@ ColumnZoneMapBuilder::ColumnZoneMapBuilder(const TypeInfo* type_info) : _type_in _field.reset(FieldFactory::create_by_type(_type_info->type())); _zone_map.min_value = _arena.Allocate(_type_info->size()); _zone_map.max_value = _arena.Allocate(_type_info->size()); + + if (_type_info->type() == OLAP_FIELD_TYPE_VARCHAR) { + _max_varchar_value = _arena.Allocate(OLAP_STRING_MAX_LENGTH); + memset(_max_varchar_value, 0xFF, OLAP_STRING_MAX_LENGTH); + } else if (_type_info->type() == OLAP_FIELD_TYPE_CHAR) { + _max_char_value = _arena.Allocate(OLAP_CHAR_MAX_LENGTH); + memset(_max_char_value, 0xFF, OLAP_CHAR_MAX_LENGTH); + } + _reset_zone_map(); } @@ -80,14 +89,15 @@ void ColumnZoneMapBuilder::_reset_zone_map() { // we should allocate max varchar length and set to max for min value if (_type_info->type() == OLAP_FIELD_TYPE_VARCHAR) { Slice *min_slice = (Slice *)_zone_map.min_value; - min_slice->data = _arena.Allocate(OLAP_STRING_MAX_LENGTH);; + min_slice->data = _max_varchar_value; min_slice->size = OLAP_STRING_MAX_LENGTH; } else if (_type_info->type() == OLAP_FIELD_TYPE_CHAR) { Slice *min_value = (Slice *)_zone_map.min_value; - min_value->data = _arena.Allocate(OLAP_CHAR_MAX_LENGTH);; + min_value->data = _max_char_value; min_value->size = OLAP_CHAR_MAX_LENGTH; + } else { + _field->set_to_max(_zone_map.min_value); } - _field->set_to_max(_zone_map.min_value); _field->set_to_min(_zone_map.max_value); _zone_map.has_null = false; _zone_map.has_not_null = false; diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.h 
b/be/src/olap/rowset/segment_v2/column_zone_map.h index 3e2b7ac78f6a12..12e6f7948975f0 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.h +++ b/be/src/olap/rowset/segment_v2/column_zone_map.h @@ -73,6 +73,8 @@ class ColumnZoneMapBuilder { std::unique_ptr _field; // memory will be managed by arena ZoneMap _zone_map; + char* _max_char_value; + char* _max_varchar_value; Arena _arena; }; From c09aabc4976ee2db4e70fb893a218170bb026e76 Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Fri, 27 Sep 2019 16:10:21 +0800 Subject: [PATCH 11/15] reuse column zone map char array --- .../olap/rowset/segment_v2/column_zone_map.cpp | 16 +++++++++++++--- be/src/olap/rowset/segment_v2/column_zone_map.h | 2 ++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.cpp b/be/src/olap/rowset/segment_v2/column_zone_map.cpp index 8e61e52f7c9f76..93045d427c3779 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.cpp +++ b/be/src/olap/rowset/segment_v2/column_zone_map.cpp @@ -30,6 +30,15 @@ ColumnZoneMapBuilder::ColumnZoneMapBuilder(const TypeInfo* type_info) : _type_in _field.reset(FieldFactory::create_by_type(_type_info->type())); _zone_map.min_value = _arena.Allocate(_type_info->size()); _zone_map.max_value = _arena.Allocate(_type_info->size()); + + if (_type_info->type() == OLAP_FIELD_TYPE_VARCHAR) { + _max_varchar_value = _arena.Allocate(OLAP_STRING_MAX_LENGTH); + memset(_max_varchar_value, 0xFF, OLAP_STRING_MAX_LENGTH); + } else if (_type_info->type() == OLAP_FIELD_TYPE_CHAR) { + _max_char_value = _arena.Allocate(OLAP_CHAR_MAX_LENGTH); + memset(_max_char_value, 0xFF, OLAP_CHAR_MAX_LENGTH); + } + _reset_zone_map(); } @@ -80,14 +89,15 @@ void ColumnZoneMapBuilder::_reset_zone_map() { // we should allocate max varchar length and set to max for min value if (_type_info->type() == OLAP_FIELD_TYPE_VARCHAR) { Slice *min_slice = (Slice *)_zone_map.min_value; - min_slice->data = 
_arena.Allocate(OLAP_STRING_MAX_LENGTH);; + min_slice->data = _max_varchar_value; min_slice->size = OLAP_STRING_MAX_LENGTH; } else if (_type_info->type() == OLAP_FIELD_TYPE_CHAR) { Slice *min_value = (Slice *)_zone_map.min_value; - min_value->data = _arena.Allocate(OLAP_CHAR_MAX_LENGTH);; + min_value->data = _max_char_value; min_value->size = OLAP_CHAR_MAX_LENGTH; + } else { + _field->set_to_max(_zone_map.min_value); } - _field->set_to_max(_zone_map.min_value); _field->set_to_min(_zone_map.max_value); _zone_map.has_null = false; _zone_map.has_not_null = false; diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.h b/be/src/olap/rowset/segment_v2/column_zone_map.h index 3e2b7ac78f6a12..12e6f7948975f0 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.h +++ b/be/src/olap/rowset/segment_v2/column_zone_map.h @@ -73,6 +73,8 @@ class ColumnZoneMapBuilder { std::unique_ptr _field; // memory will be managed by arena ZoneMap _zone_map; + char* _max_char_value; + char* _max_varchar_value; Arena _arena; }; From 24890453f6e76ff48c4acd432d3ce7329b576c9b Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Fri, 27 Sep 2019 20:19:23 +0800 Subject: [PATCH 12/15] add get_type_value_with_arena --- be/src/olap/field.h | 5 ++++ .../rowset/segment_v2/column_zone_map.cpp | 29 ++++--------------- be/src/olap/types.cpp | 1 + be/src/olap/types.h | 27 +++++++++++++++-- 4 files changed, 36 insertions(+), 26 deletions(-) diff --git a/be/src/olap/field.h b/be/src/olap/field.h index 32ee098a1e418d..dcf20de4514393 100644 --- a/be/src/olap/field.h +++ b/be/src/olap/field.h @@ -57,6 +57,7 @@ class Field { inline void set_to_max(char* buf) const { return _type_info->set_to_max(buf); } inline void set_to_min(char* buf) const { return _type_info->set_to_min(buf); } + inline char* get_type_value_with_arena(Arena* arena) const { return _type_info->get_type_value_with_arena(arena); } inline void agg_update(RowCursorCell* dest, const RowCursorCell& src, Arena* arena = nullptr) 
const { _agg_info->update(dest, src, arena); @@ -199,6 +200,10 @@ class Field { _type_info->deep_copy_with_arena(dest, src, arena); } + inline void direct_copy_content(char* dest, const char* src) const { + _type_info->direct_copy(dest, src); + } + // Copy srouce content to destination in index format. template void to_index(DstCellType* dst, const SrcCellType& src) const; diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.cpp b/be/src/olap/rowset/segment_v2/column_zone_map.cpp index 93045d427c3779..f5f1d541801f03 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.cpp +++ b/be/src/olap/rowset/segment_v2/column_zone_map.cpp @@ -28,16 +28,8 @@ ColumnZoneMapBuilder::ColumnZoneMapBuilder(const TypeInfo* type_info) : _type_in options.data_page_size = 0; _page_builder.reset(new BinaryPlainPageBuilder(options)); _field.reset(FieldFactory::create_by_type(_type_info->type())); - _zone_map.min_value = _arena.Allocate(_type_info->size()); - _zone_map.max_value = _arena.Allocate(_type_info->size()); - - if (_type_info->type() == OLAP_FIELD_TYPE_VARCHAR) { - _max_varchar_value = _arena.Allocate(OLAP_STRING_MAX_LENGTH); - memset(_max_varchar_value, 0xFF, OLAP_STRING_MAX_LENGTH); - } else if (_type_info->type() == OLAP_FIELD_TYPE_CHAR) { - _max_char_value = _arena.Allocate(OLAP_CHAR_MAX_LENGTH); - memset(_max_char_value, 0xFF, OLAP_CHAR_MAX_LENGTH); - } + _zone_map.min_value = _field->get_type_value_with_arena(&_arena); + _zone_map.max_value = _field->get_type_value_with_arena(&_arena); _reset_zone_map(); } @@ -46,10 +38,10 @@ Status ColumnZoneMapBuilder::add(const uint8_t *vals, size_t count) { if (vals != nullptr) { for (int i = 0; i < count; ++i) { if (_field->compare(_zone_map.min_value, (char *)vals) > 0) { - _field->deep_copy_content(_zone_map.min_value, (const char *)vals, &_arena); + _field->direct_copy_content(_zone_map.min_value, (const char *)vals); } if (_field->compare(_zone_map.max_value, (char *)vals) < 0) { - 
_field->deep_copy_content(_zone_map.max_value, (const char *)vals, &_arena); + _field->direct_copy_content(_zone_map.max_value, (const char *)vals); } vals += _type_info->size(); if (!_zone_map.has_not_null) { @@ -86,18 +78,7 @@ Status ColumnZoneMapBuilder::flush() { } void ColumnZoneMapBuilder::_reset_zone_map() { - // we should allocate max varchar length and set to max for min value - if (_type_info->type() == OLAP_FIELD_TYPE_VARCHAR) { - Slice *min_slice = (Slice *)_zone_map.min_value; - min_slice->data = _max_varchar_value; - min_slice->size = OLAP_STRING_MAX_LENGTH; - } else if (_type_info->type() == OLAP_FIELD_TYPE_CHAR) { - Slice *min_value = (Slice *)_zone_map.min_value; - min_value->data = _max_char_value; - min_value->size = OLAP_CHAR_MAX_LENGTH; - } else { - _field->set_to_max(_zone_map.min_value); - } + _field->set_to_max(_zone_map.min_value); _field->set_to_min(_zone_map.max_value); _zone_map.has_null = false; _zone_map.has_not_null = false; diff --git a/be/src/olap/types.cpp b/be/src/olap/types.cpp index 87ef27b6430d25..2910c715fd564c 100644 --- a/be/src/olap/types.cpp +++ b/be/src/olap/types.cpp @@ -27,6 +27,7 @@ TypeInfo::TypeInfo(TypeTraitsClass t) _deep_copy(TypeTraitsClass::deep_copy), _deep_copy_with_arena(TypeTraitsClass::deep_copy_with_arena), _direct_copy(TypeTraitsClass::direct_copy), + _get_type_value_with_arena(TypeTraitsClass::get_type_value_with_arena), _from_string(TypeTraitsClass::from_string), _to_string(TypeTraitsClass::to_string), _set_to_max(TypeTraitsClass::set_to_max), diff --git a/be/src/olap/types.h b/be/src/olap/types.h index ba84c588a31209..2b75d6a4b18ee5 100644 --- a/be/src/olap/types.h +++ b/be/src/olap/types.h @@ -64,6 +64,10 @@ class TypeInfo { _direct_copy(dest, src); } + inline char* get_type_value_with_arena(Arena* arena) const { + return _get_type_value_with_arena(arena); + } + OLAPStatus from_string(void* buf, const std::string& scan_key) const { return _from_string(buf, scan_key); } @@ -85,6 +89,7 @@ class TypeInfo 
{ void (*_deep_copy)(void* dest, const void* src, MemPool* mem_pool); void (*_deep_copy_with_arena)(void* dest, const void* src, Arena* arena); void (*_direct_copy)(void* dest, const void* src); + char* (*_get_type_value_with_arena)(Arena* arena); OLAPStatus (*_from_string)(void* buf, const std::string& scan_key); std::string (*_to_string)(const void* src); @@ -213,6 +218,10 @@ struct BaseFieldtypeTraits : public CppTypeTraits { return HashUtil::hash(data, sizeof(CppType), seed); } + static inline char* get_type_value_with_arena(Arena* arena) { + return arena->Allocate(sizeof(CppType)); + } + static std::string to_string(const void* src) { std::stringstream stream; stream << *reinterpret_cast(src); @@ -227,6 +236,7 @@ struct BaseFieldtypeTraits : public CppTypeTraits { *reinterpret_cast(buf) = value; return OLAP_SUCCESS; } + }; template @@ -568,6 +578,13 @@ struct FieldTypeTraits : public BaseFieldtypeTraits(data); return HashUtil::hash(slice->data, slice->size, seed); } + static char* get_type_value_with_arena(Arena* arena) { + char* type_value = arena->Allocate(sizeof(Slice)); + Slice* real_type_value = (Slice*)type_value; + real_type_value->size = OLAP_CHAR_MAX_LENGTH; + real_type_value->data = arena->Allocate(OLAP_CHAR_MAX_LENGTH); + return type_value; + } }; template<> @@ -587,13 +604,19 @@ struct FieldTypeTraits : public FieldTypeTraits(buf); - slice->size = 1; - memset(slice->data, 0xFF, 1); + memset(slice->data, 0xFF, slice->size); } static void set_to_min(void* buf) { auto slice = reinterpret_cast(buf); slice->size = 0; } + static char* get_type_value_with_arena(Arena* arena) { + char* type_value = arena->Allocate(sizeof(Slice)); + Slice* real_type_value = (Slice*)type_value; + real_type_value->size = OLAP_STRING_MAX_LENGTH; + real_type_value->data = arena->Allocate(OLAP_STRING_MAX_LENGTH); + return type_value; + } }; template<> From 429d3722a0108787fbee4bacda1ce77dd07b3c19 Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Sun, 29 Sep 2019 
20:33:44 +0800 Subject: [PATCH 13/15] 1. use allocate from 2. interpret not cast 3.remove useless var 4. set_to_min/max consist with master --- be/src/olap/field.h | 2 +- .../rowset/segment_v2/column_zone_map.cpp | 4 +-- .../olap/rowset/segment_v2/column_zone_map.h | 2 -- be/src/olap/types.cpp | 2 +- be/src/olap/types.h | 30 +++++++++---------- 5 files changed, 19 insertions(+), 21 deletions(-) diff --git a/be/src/olap/field.h b/be/src/olap/field.h index c4161a0f00492d..81215c97640488 100644 --- a/be/src/olap/field.h +++ b/be/src/olap/field.h @@ -57,7 +57,7 @@ class Field { inline void set_to_max(char* buf) const { return _type_info->set_to_max(buf); } inline void set_to_min(char* buf) const { return _type_info->set_to_min(buf); } - inline char* get_type_value_with_arena(Arena* arena) const { return _type_info->get_type_value_with_arena(arena); } + inline char* allocate_value_from_arena(Arena* arena) const { return _type_info->allocate_value_from_arena(arena); } inline void agg_update(RowCursorCell* dest, const RowCursorCell& src, MemPool* mem_pool = nullptr) const { _agg_info->update(dest, src, mem_pool); diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.cpp b/be/src/olap/rowset/segment_v2/column_zone_map.cpp index f5f1d541801f03..61dd3f25639810 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.cpp +++ b/be/src/olap/rowset/segment_v2/column_zone_map.cpp @@ -28,8 +28,8 @@ ColumnZoneMapBuilder::ColumnZoneMapBuilder(const TypeInfo* type_info) : _type_in options.data_page_size = 0; _page_builder.reset(new BinaryPlainPageBuilder(options)); _field.reset(FieldFactory::create_by_type(_type_info->type())); - _zone_map.min_value = _field->get_type_value_with_arena(&_arena); - _zone_map.max_value = _field->get_type_value_with_arena(&_arena); + _zone_map.min_value = _field->allocate_value_from_arena(&_arena); + _zone_map.max_value = _field->allocate_value_from_arena(&_arena); _reset_zone_map(); } diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.h 
b/be/src/olap/rowset/segment_v2/column_zone_map.h index 12e6f7948975f0..3e2b7ac78f6a12 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.h +++ b/be/src/olap/rowset/segment_v2/column_zone_map.h @@ -73,8 +73,6 @@ class ColumnZoneMapBuilder { std::unique_ptr _field; // memory will be managed by arena ZoneMap _zone_map; - char* _max_char_value; - char* _max_varchar_value; Arena _arena; }; diff --git a/be/src/olap/types.cpp b/be/src/olap/types.cpp index 2910c715fd564c..36704fcdd9d82d 100644 --- a/be/src/olap/types.cpp +++ b/be/src/olap/types.cpp @@ -27,7 +27,7 @@ TypeInfo::TypeInfo(TypeTraitsClass t) _deep_copy(TypeTraitsClass::deep_copy), _deep_copy_with_arena(TypeTraitsClass::deep_copy_with_arena), _direct_copy(TypeTraitsClass::direct_copy), - _get_type_value_with_arena(TypeTraitsClass::get_type_value_with_arena), + _allocate_value_from_arena(TypeTraitsClass::allocate_value_from_arena), _from_string(TypeTraitsClass::from_string), _to_string(TypeTraitsClass::to_string), _set_to_max(TypeTraitsClass::set_to_max), diff --git a/be/src/olap/types.h b/be/src/olap/types.h index 2b75d6a4b18ee5..bd381106a02c57 100644 --- a/be/src/olap/types.h +++ b/be/src/olap/types.h @@ -64,8 +64,8 @@ class TypeInfo { _direct_copy(dest, src); } - inline char* get_type_value_with_arena(Arena* arena) const { - return _get_type_value_with_arena(arena); + inline char* allocate_value_from_arena(Arena* arena) const { + return _allocate_value_from_arena(arena); } OLAPStatus from_string(void* buf, const std::string& scan_key) const { @@ -89,7 +89,7 @@ class TypeInfo { void (*_deep_copy)(void* dest, const void* src, MemPool* mem_pool); void (*_deep_copy_with_arena)(void* dest, const void* src, Arena* arena); void (*_direct_copy)(void* dest, const void* src); - char* (*_get_type_value_with_arena)(Arena* arena); + char* (*_allocate_value_from_arena)(Arena* arena); OLAPStatus (*_from_string)(void* buf, const std::string& scan_key); std::string (*_to_string)(const void* src); @@ -218,7 +218,7 @@ 
struct BaseFieldtypeTraits : public CppTypeTraits { return HashUtil::hash(data, sizeof(CppType), seed); } - static inline char* get_type_value_with_arena(Arena* arena) { + static inline char* allocate_value_from_arena(Arena* arena) { return arena->Allocate(sizeof(CppType)); } @@ -236,7 +236,6 @@ struct BaseFieldtypeTraits : public CppTypeTraits { *reinterpret_cast(buf) = value; return OLAP_SUCCESS; } - }; template @@ -572,17 +571,17 @@ struct FieldTypeTraits : public BaseFieldtypeTraits(buf); - slice->size = 0; + memset(slice->data, 0, slice->size); } static uint32_t hash_code(const void* data, uint32_t seed) { auto slice = reinterpret_cast(data); return HashUtil::hash(slice->data, slice->size, seed); } - static char* get_type_value_with_arena(Arena* arena) { + static char* allocate_value_from_arena(Arena* arena) { char* type_value = arena->Allocate(sizeof(Slice)); - Slice* real_type_value = (Slice*)type_value; - real_type_value->size = OLAP_CHAR_MAX_LENGTH; - real_type_value->data = arena->Allocate(OLAP_CHAR_MAX_LENGTH); + auto slice = reinterpret_cast(type_value); + slice->size = OLAP_CHAR_MAX_LENGTH; + slice->data = arena->Allocate(OLAP_CHAR_MAX_LENGTH); return type_value; } }; @@ -604,17 +603,18 @@ struct FieldTypeTraits : public FieldTypeTraits(buf); - memset(slice->data, 0xFF, slice->size); + slice->size = 1; + memset(slice->data, 0xFF, 1); } static void set_to_min(void* buf) { auto slice = reinterpret_cast(buf); slice->size = 0; } - static char* get_type_value_with_arena(Arena* arena) { + static char* allocate_value_from_arena(Arena* arena) { char* type_value = arena->Allocate(sizeof(Slice)); - Slice* real_type_value = (Slice*)type_value; - real_type_value->size = OLAP_STRING_MAX_LENGTH; - real_type_value->data = arena->Allocate(OLAP_STRING_MAX_LENGTH); + auto slice = reinterpret_cast(type_value); + slice->size = OLAP_STRING_MAX_LENGTH; + slice->data = arena->Allocate(OLAP_STRING_MAX_LENGTH); return type_value; } }; From 
98fdf6f9a1df34f3064f8a9fd1a7d0081d19f382 Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Sun, 29 Sep 2019 22:49:43 +0800 Subject: [PATCH 14/15] refresh commit --- be/test/olap/rowset/segment_v2/segment_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp index afaeb97f25ae67..01121ce872361c 100644 --- a/be/test/olap/rowset/segment_v2/segment_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_test.cpp @@ -824,7 +824,7 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { TCondition condition; condition.__set_column_name("1"); condition.__set_condition_op("<"); - std::vector vals = {"-1"}; + std::vector vals = {"-2"}; condition.__set_condition_values(vals); std::shared_ptr conditions(new Conditions()); conditions->set_tablet_schema(tablet_schema.get()); From e0d6ca088cfbb3f5aa1d33f174f0fc3d1259e5ac Mon Sep 17 00:00:00 2001 From: wangbo <506340561@qq.com> Date: Mon, 30 Sep 2019 13:02:48 +0800 Subject: [PATCH 15/15] add commit for lazy init dict --- be/src/olap/rowset/segment_v2/column_reader.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index b440ffe90d4c1b..86889c38ea4451 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -431,6 +431,10 @@ Status FileColumnIterator::_read_page(const OrdinalPageIndexIterator& iter, Pars RETURN_IF_ERROR(_reader->encoding_info()->create_page_decoder(data, options, &page->data_decoder)); RETURN_IF_ERROR(page->data_decoder->init()); + // lazily init the dict_encoding's dict for three reasons: + // 1. a column using dictionary encoding may still have non-dict-encoded data pages that are seeked, so load the dict only when necessary + // 2. 
ColumnReader, which is owned by Segment and Rowset, can stay alive even when there is no query, so it should retain as little memory as possible. + // 3. Iterators of the same column won't load the dict page repeatedly, because of the page cache. if (_reader->encoding_info()->encoding() == DICT_ENCODING) { BinaryDictPageDecoder* binary_dict_page_decoder = (BinaryDictPageDecoder*)page->data_decoder; if (binary_dict_page_decoder->is_dict_encoding()) {