diff --git a/be/src/common/config.h b/be/src/common/config.h index cc900b5f53a79b..3d28350ab42465 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -246,12 +246,20 @@ CONF_Bool(enable_storage_vectorization, "true"); CONF_Bool(enable_low_cardinality_optimize, "true"); // be policy +// whether check compaction checksum +CONF_mBool(enable_compaction_checksum, "false"); // whether disable automatic compaction task CONF_mBool(disable_auto_compaction, "false"); // whether enable vectorized compaction CONF_Bool(enable_vectorized_compaction, "true"); // whether enable vectorized schema change/material-view/rollup task. CONF_Bool(enable_vectorized_alter_table, "true"); +// whether enable vertical compaction +CONF_mBool(enable_vertical_compaction, "false"); +// In vertical compaction, column number for every group +CONF_Int32(vertical_compaction_num_columns_per_group, "5"); +// In vertical compaction, max memory usage for row_source_buffer +CONF_Int32(vertical_compaction_max_row_source_memory_mb, "200"); // check the configuration of auto compaction in seconds when auto compaction disabled CONF_mInt32(check_auto_compaction_interval_seconds, "5"); diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 41b2ebd902a7c3..6d0425465cfbf6 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -22,6 +22,7 @@ #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_meta.h" #include "olap/tablet.h" +#include "olap/task/engine_checksum_task.h" #include "util/time.h" #include "util/trace.h" @@ -112,14 +113,50 @@ Status Compaction::quick_rowsets_compact() { Status Compaction::do_compaction(int64_t permits) { TRACE("start to do compaction"); + uint32_t checksum_before; + uint32_t checksum_after; + if (config::enable_compaction_checksum) { + EngineChecksumTask checksum_task(_tablet->tablet_id(), _tablet->schema_hash(), + _input_rowsets.back()->end_version(), &checksum_before); + checksum_task.execute(); + } + _tablet->data_dir()->disks_compaction_score_increment(permits); _tablet->data_dir()->disks_compaction_num_increment(1); Status st = do_compaction_impl(permits); _tablet->data_dir()->disks_compaction_score_increment(-permits); _tablet->data_dir()->disks_compaction_num_increment(-1); + + if (config::enable_compaction_checksum) { + EngineChecksumTask checksum_task(_tablet->tablet_id(), _tablet->schema_hash(), + _input_rowsets.back()->end_version(), &checksum_after); + checksum_task.execute(); + if (checksum_before != checksum_after) { + LOG(WARNING) << "Compaction tablet=" << _tablet->tablet_id() + << " checksum not consistent" + << ", before=" << checksum_before << ", checksum_after=" << checksum_after; + } + } return st; } +bool Compaction::should_vertical_compaction() { + // some conditions that not use vertical compaction + if (!config::enable_vertical_compaction) { + return false; + } + if (_tablet->enable_unique_key_merge_on_write()) { + return false; + } + return true; +} + +int64_t Compaction::get_avg_segment_rows() { + // take care of empty rowset + // todo(yixiu): add a new conf of segment size in compaction + return config::write_buffer_size / (_input_rowsets_size / (_input_row_num + 1) + 1); +} + Status Compaction::do_compaction_impl(int64_t permits) { OlapStopWatch watch; @@ -142,9 +179,11 @@ Status Compaction::do_compaction_impl(int64_t permits) { auto use_vectorized_compaction = config::enable_vectorized_compaction; string merge_type = use_vectorized_compaction ? 
"v" : ""; + bool vertical_compaction = should_vertical_compaction(); LOG(INFO) << "start " << merge_type << compaction_name() << ". tablet=" << _tablet->full_name() - << ", output_version=" << _output_version << ", permits: " << permits; + << ", output_version=" << _output_version << ", permits: " << permits + << ", is_vertical_compaction=" << vertical_compaction; // get cur schema if rowset schema exist, rowset schema must be newer than tablet schema std::vector rowset_metas(_input_rowsets.size()); std::transform(_input_rowsets.begin(), _input_rowsets.end(), rowset_metas.begin(), @@ -152,7 +191,7 @@ Status Compaction::do_compaction_impl(int64_t permits) { TabletSchemaSPtr cur_tablet_schema = _tablet->rowset_meta_with_max_schema_version(rowset_metas)->tablet_schema(); - RETURN_NOT_OK(construct_output_rowset_writer(cur_tablet_schema)); + RETURN_NOT_OK(construct_output_rowset_writer(cur_tablet_schema, vertical_compaction)); RETURN_NOT_OK(construct_input_rowset_readers()); TRACE("prepare finished"); @@ -166,8 +205,14 @@ Status Compaction::do_compaction_impl(int64_t permits) { } if (use_vectorized_compaction) { - res = Merger::vmerge_rowsets(_tablet, compaction_type(), cur_tablet_schema, - _input_rs_readers, _output_rs_writer.get(), &stats); + if (vertical_compaction) { + res = Merger::vertical_merge_rowsets(_tablet, compaction_type(), cur_tablet_schema, + _input_rs_readers, _output_rs_writer.get(), + get_avg_segment_rows(), &stats); + } else { + res = Merger::vmerge_rowsets(_tablet, compaction_type(), cur_tablet_schema, + _input_rs_readers, _output_rs_writer.get(), &stats); + } } else { res = Merger::merge_rowsets(_tablet, compaction_type(), cur_tablet_schema, _input_rs_readers, _output_rs_writer.get(), &stats); @@ -233,11 +278,15 @@ Status Compaction::do_compaction_impl(int64_t permits) { << "s. cumulative_compaction_policy=" << (cumu_policy == nullptr ? 
"quick" : cumu_policy->name()) << ", compact_row_per_second=" << int(_input_row_num / watch.get_elapse_second()); - return Status::OK(); } -Status Compaction::construct_output_rowset_writer(TabletSchemaSPtr schema) { +Status Compaction::construct_output_rowset_writer(TabletSchemaSPtr schema, bool is_vertical) { + if (is_vertical) { + return _tablet->create_vertical_rowset_writer(_output_version, VISIBLE, NONOVERLAPPING, + schema, _oldest_write_timestamp, + _newest_write_timestamp, &_output_rs_writer); + } return _tablet->create_rowset_writer(_output_version, VISIBLE, NONOVERLAPPING, schema, _oldest_write_timestamp, _newest_write_timestamp, &_output_rs_writer); diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h index 26d985f9c007f1..e293273a3d0034 100644 --- a/be/src/olap/compaction.h +++ b/be/src/olap/compaction.h @@ -65,7 +65,7 @@ class Compaction { Status modify_rowsets(); void gc_output_rowset(); - Status construct_output_rowset_writer(TabletSchemaSPtr schema); + Status construct_output_rowset_writer(TabletSchemaSPtr schema, bool is_vertical); Status construct_input_rowset_readers(); Status check_version_continuity(const std::vector& rowsets); @@ -74,6 +74,9 @@ class Compaction { std::vector* missing_version); int64_t get_compaction_permits(); + bool should_vertical_compaction(); + int64_t get_avg_segment_rows(); + protected: // the root tracker for this compaction std::shared_ptr _mem_tracker; diff --git a/be/src/olap/iterators.h b/be/src/olap/iterators.h index 4f12118c2c2f4d..a7a5bcfbd48c06 100644 --- a/be/src/olap/iterators.h +++ b/be/src/olap/iterators.h @@ -33,6 +33,10 @@ class RowBlockV2; class Schema; class ColumnPredicate; +namespace vectorized { +struct IteratorRowRef; +}; + class StorageReadOptions { public: struct KeyRange { @@ -121,6 +125,13 @@ class RowwiseIterator { return Status::NotSupported("to be implemented"); } + virtual Status next_row(vectorized::IteratorRowRef* ref) { + return Status::NotSupported("to be implemented"); + } + virtual Status unique_key_next_row(vectorized::IteratorRowRef* ref) { + return Status::NotSupported("to be implemented"); + } + virtual bool support_return_data_by_ref() { return false; } virtual Status current_block_row_locations(std::vector* block_row_locations) { @@ -136,6 +147,9 @@ class RowwiseIterator { // Return the data id such as segment id, used for keep the insert order when do // merge sort in priority queue virtual uint64_t data_id() const { return 0; } + + // return rows merged count by iterator + virtual uint64_t merged_rows() const { return 0; } }; } // namespace doris diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index ace3d6b39a345a..52f9cd934ec543 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -26,6 +26,8 @@ #include "olap/tuple_reader.h" #include "util/trace.h" #include "vec/olap/block_reader.h" +#include "vec/olap/vertical_block_reader.h" +#include "vec/olap/vertical_merge_iterator.h" namespace doris { @@ -188,4 +190,137 @@ Status Merger::vmerge_rowsets(TabletSharedPtr tablet, ReaderType reader_type, return Status::OK(); } +// split columns into several groups, make sure all keys in one group +// unique_key should consider sequence&delete column +void Merger::vertical_split_columns(TabletSchemaSPtr tablet_schema, + std::vector>* column_groups) { + uint32_t num_columns_per_group = config::vertical_compaction_num_columns_per_group; + uint32_t num_key_cols = tablet_schema->num_key_columns(); + uint32_t total_cols = tablet_schema->num_columns(); + std::vector key_columns; + for 
(auto i = 0; i < num_key_cols; ++i) { + key_columns.emplace_back(i); + } + // in unique key, sequence & delete sign column should merge with key columns + int32_t sequence_col_idx = -1; + int32_t delete_sign_idx = -1; + // in key column compaction, seq_col real index is _block->columns() -2 + // and delete_sign column is _block->columns() - 1 + if (tablet_schema->keys_type() == KeysType::UNIQUE_KEYS) { + if (tablet_schema->has_sequence_col()) { + sequence_col_idx = tablet_schema->sequence_col_idx(); + key_columns.emplace_back(sequence_col_idx); + } + delete_sign_idx = tablet_schema->field_index(DELETE_SIGN); + key_columns.emplace_back(delete_sign_idx); + } + VLOG_NOTICE << "sequence_col_idx=" << sequence_col_idx + << ", delete_sign_idx=" << delete_sign_idx; + column_groups->emplace_back(std::move(key_columns)); + std::vector value_columns; + for (auto i = num_key_cols; i < total_cols; ++i) { + if (i == sequence_col_idx || i == delete_sign_idx) { + continue; + } + if ((i - num_key_cols) % num_columns_per_group == 0) { + column_groups->emplace_back(); + } + column_groups->back().emplace_back(i); + } +} + +Status Merger::vertical_compact_one_group( + TabletSharedPtr tablet, ReaderType reader_type, TabletSchemaSPtr tablet_schema, bool is_key, + const std::vector& column_group, vectorized::RowSourcesBuffer* row_source_buf, + const std::vector& src_rowset_readers, + RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment, Statistics* stats_output) { + // build tablet reader + VLOG_NOTICE << "vertical compact one group, max_rows_per_segment=" << max_rows_per_segment; + vectorized::VerticalBlockReader reader(row_source_buf); + TabletReader::ReaderParams reader_params; + reader_params.is_key_column_group = is_key; + reader_params.tablet = tablet; + reader_params.reader_type = reader_type; + reader_params.rs_readers = src_rowset_readers; + reader_params.version = dst_rowset_writer->version(); + { + std::shared_lock rdlock(tablet->get_header_lock()); + auto delete_preds = tablet->delete_predicates(); + std::copy(delete_preds.cbegin(), delete_preds.cend(), + std::inserter(reader_params.delete_predicates, + reader_params.delete_predicates.begin())); + } + TabletSchemaSPtr merge_tablet_schema = std::make_shared(); + merge_tablet_schema->copy_from(*tablet_schema); + // Merge the columns in delete predicate that not in latest schema in to current tablet schema + for (auto& del_pred_rs : reader_params.delete_predicates) { + merge_tablet_schema->merge_dropped_columns(tablet->tablet_schema(del_pred_rs->version())); + } + reader_params.tablet_schema = merge_tablet_schema; + + reader_params.return_columns = column_group; + reader_params.origin_return_columns = &reader_params.return_columns; + RETURN_NOT_OK(reader.init(reader_params)); + + vectorized::Block block = tablet_schema->create_block(reader_params.return_columns); + size_t output_rows = 0; + bool eof = false; + while (!eof) { + // Read one block from block reader + RETURN_NOT_OK_LOG( + reader.next_block_with_aggregation(&block, nullptr, nullptr, &eof), + "failed to read next block when merging rowsets of tablet " + tablet->full_name()); + RETURN_NOT_OK_LOG( + dst_rowset_writer->add_columns(&block, column_group, is_key, max_rows_per_segment), + "failed to write block when merging rowsets of tablet " + tablet->full_name()); + + output_rows += block.rows(); + block.clear_column_data(); + } + + if (is_key && stats_output != nullptr) { + stats_output->output_rows = output_rows; + stats_output->merged_rows = reader.merged_rows(); + 
stats_output->filtered_rows = reader.filtered_rows(); + } + RETURN_IF_ERROR(dst_rowset_writer->flush_columns()); + + return Status::OK(); +} + +// steps to do vertical merge: +// 1. split columns into column groups +// 2. compact groups one by one, generate a row_source_buf when compact key group +// and use this row_source_buf to compact value column groups +// 3. build output rowset +Status Merger::vertical_merge_rowsets(TabletSharedPtr tablet, ReaderType reader_type, + TabletSchemaSPtr tablet_schema, + const std::vector& src_rowset_readers, + RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment, + Statistics* stats_output) { + LOG(INFO) << "Start to do vertical compaction, tablet_id: " << tablet->tablet_id(); + std::vector> column_groups; + vertical_split_columns(tablet_schema, &column_groups); + + vectorized::RowSourcesBuffer row_sources_buf(tablet->tablet_id(), tablet->tablet_path(), + reader_type); + // compact group one by one + for (auto i = 0; i < column_groups.size(); ++i) { + VLOG_NOTICE << "row source size: " << row_sources_buf.total_size(); + bool is_key = (i == 0); + RETURN_IF_ERROR(vertical_compact_one_group( + tablet, reader_type, tablet_schema, is_key, column_groups[i], &row_sources_buf, + src_rowset_readers, dst_rowset_writer, max_rows_per_segment, stats_output)); + if (is_key) { + row_sources_buf.flush(); + } + row_sources_buf.seek_to_begin(); + } + // finish compact, build output rowset + VLOG_NOTICE << "finish compact groups"; + RETURN_IF_ERROR(dst_rowset_writer->final_flush()); + + return Status::OK(); +} + } // namespace doris diff --git a/be/src/olap/merger.h b/be/src/olap/merger.h index e0286e158d5378..aff9a741c47444 100644 --- a/be/src/olap/merger.h +++ b/be/src/olap/merger.h @@ -24,6 +24,10 @@ namespace doris { +namespace vectorized { +class RowSourcesBuffer; +}; + class Merger { public: struct Statistics { @@ -46,6 +50,23 @@ class Merger { TabletSchemaSPtr cur_tablet_schema, const std::vector& src_rowset_readers, RowsetWriter* dst_rowset_writer, Statistics* stats_output); + static Status vertical_merge_rowsets( + TabletSharedPtr tablet, ReaderType reader_type, TabletSchemaSPtr tablet_schema, + const std::vector& src_rowset_readers, + RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment, + Statistics* stats_output); + +public: + // for vertical compaction + static void vertical_split_columns(TabletSchemaSPtr tablet_schema, + std::vector>* column_groups); + static Status vertical_compact_one_group( + TabletSharedPtr tablet, ReaderType reader_type, TabletSchemaSPtr tablet_schema, + bool is_key, const std::vector& column_group, + vectorized::RowSourcesBuffer* row_source_buf, + const std::vector& src_rowset_readers, + RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment, + Statistics* stats_output); }; } // namespace doris diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp index c757906d8d03f6..2bb7131822b2fc 100644 --- a/be/src/olap/reader.cpp +++ b/be/src/olap/reader.cpp @@ -221,6 +221,7 @@ Status TabletReader::_capture_rs_readers(const ReaderParams& read_params, _reader_context.delete_bitmap = read_params.delete_bitmap; _reader_context.enable_unique_key_merge_on_write = tablet()->enable_unique_key_merge_on_write(); _reader_context.record_rowids = read_params.record_rowids; + _reader_context.is_key_column_group = read_params.is_key_column_group; *valid_rs_readers = *rs_readers; diff --git a/be/src/olap/reader.h b/be/src/olap/reader.h index 7c5dc32fcff00f..535bd9bda1e123 100644 --- a/be/src/olap/reader.h +++ 
b/be/src/olap/reader.h @@ -102,6 +102,9 @@ class TabletReader { // num of columns for orderby key size_t read_orderby_key_num_prefix_columns = 0; + // for vertical compaction + bool is_key_column_group = false; + void check_validation() const; std::string to_string() const; @@ -134,7 +137,7 @@ class TabletReader { return Status::OLAPInternalError(OLAP_ERR_READER_INITIALIZE_ERROR); } - uint64_t merged_rows() const { return _merged_rows; } + virtual uint64_t merged_rows() const { return _merged_rows; } uint64_t filtered_rows() const { return _stats.rows_del_filtered + _stats.rows_del_by_bitmap + diff --git a/be/src/olap/rowset/CMakeLists.txt b/be/src/olap/rowset/CMakeLists.txt index a701fa1005e43c..da6375a2d17a9b 100644 --- a/be/src/olap/rowset/CMakeLists.txt +++ b/be/src/olap/rowset/CMakeLists.txt @@ -29,4 +29,5 @@ add_library(Rowset STATIC beta_rowset.cpp beta_rowset_reader.cpp beta_rowset_writer.cpp + vertical_beta_rowset_writer.cpp rowset_tree.cpp) diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index 87893927d58471..9744346d8f2318 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -36,7 +36,16 @@ BetaRowsetReader::BetaRowsetReader(BetaRowsetSharedPtr rowset) _rowset->acquire(); } -Status BetaRowsetReader::init(RowsetReaderContext* read_context) { +void BetaRowsetReader::reset_read_options() { + _read_options.delete_condition_predicates = std::make_shared(); + _read_options.column_predicates.clear(); + _read_options.col_id_to_predicates.clear(); + _read_options.col_id_to_del_predicates.clear(); + _read_options.key_ranges.clear(); +} + +Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context, + std::vector* out_iters) { RETURN_NOT_OK(_rowset->load()); _context = read_context; if (_context->stats != nullptr) { @@ -47,30 +56,31 @@ Status BetaRowsetReader::init(RowsetReaderContext* read_context) { } // convert RowsetReaderContext to StorageReadOptions - StorageReadOptions read_options; - read_options.stats = _stats; - read_options.push_down_agg_type_opt = _context->push_down_agg_type_opt; + _read_options.stats = _stats; + _read_options.push_down_agg_type_opt = _context->push_down_agg_type_opt; if (read_context->lower_bound_keys != nullptr) { for (int i = 0; i < read_context->lower_bound_keys->size(); ++i) { - read_options.key_ranges.emplace_back(&read_context->lower_bound_keys->at(i), - read_context->is_lower_keys_included->at(i), - &read_context->upper_bound_keys->at(i), - read_context->is_upper_keys_included->at(i)); + _read_options.key_ranges.emplace_back(&read_context->lower_bound_keys->at(i), + read_context->is_lower_keys_included->at(i), + &read_context->upper_bound_keys->at(i), + read_context->is_upper_keys_included->at(i)); } } - bool can_reuse_schema = true; // delete_hanlder is always set, but it maybe not init, so that it will return empty conditions // or predicates when it is not inited. 
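To make the grouping rule in Merger::vertical_split_columns() above concrete: group 0 always carries the key columns (plus the sequence and delete-sign columns for unique-key tables), and the remaining value columns are chunked by vertical_compaction_num_columns_per_group (default 5 in config.h above). Below is a standalone sketch of that rule; "SchemaInfo" is a stand-in for TabletSchema, not the real API.

    // Standalone sketch of the grouping rule in Merger::vertical_split_columns().
    // "SchemaInfo" is a stand-in for TabletSchema, not the real class.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct SchemaInfo {
        uint32_t num_key_columns;
        uint32_t num_columns;
        int32_t sequence_col_idx; // -1 if absent
        int32_t delete_sign_idx;  // -1 if absent (unique-key tables only)
        bool is_unique_keys;
    };

    std::vector<std::vector<uint32_t>> split_columns(const SchemaInfo& s, uint32_t cols_per_group) {
        std::vector<std::vector<uint32_t>> groups;
        std::vector<uint32_t> key_group;
        for (uint32_t i = 0; i < s.num_key_columns; ++i) key_group.push_back(i);
        if (s.is_unique_keys) {
            // sequence & delete-sign columns must be merged together with the keys
            if (s.sequence_col_idx >= 0) key_group.push_back((uint32_t)s.sequence_col_idx);
            if (s.delete_sign_idx >= 0) key_group.push_back((uint32_t)s.delete_sign_idx);
        }
        groups.push_back(key_group);
        for (uint32_t i = s.num_key_columns; i < s.num_columns; ++i) {
            if ((int32_t)i == s.sequence_col_idx || (int32_t)i == s.delete_sign_idx) continue;
            if ((i - s.num_key_columns) % cols_per_group == 0) groups.emplace_back();
            groups.back().push_back(i);
        }
        return groups;
    }

    int main() {
        // 3 key columns + 12 value columns, group size 5 (the default above):
        // {0,1,2} {3,4,5,6,7} {8,9,10,11,12} {13,14}
        SchemaInfo s{3, 15, -1, -1, false};
        for (const auto& group : split_columns(s, 5)) {
            for (uint32_t cid : group) std::cout << cid << ' ';
            std::cout << '\n';
        }
        return 0;
    }
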
if (read_context->delete_handler != nullptr) { read_context->delete_handler->get_delete_conditions_after_version( - _rowset->end_version(), read_options.delete_condition_predicates.get(), - &read_options.col_id_to_del_predicates); + _rowset->end_version(), _read_options.delete_condition_predicates.get(), + &_read_options.col_id_to_del_predicates); // if del cond is not empty, schema may be different in multiple rowset - can_reuse_schema = read_options.col_id_to_del_predicates.empty(); + _can_reuse_schema = _read_options.col_id_to_del_predicates.empty(); } - - if (!can_reuse_schema || _context->reuse_input_schema == nullptr) { + // In vertical compaction, every column group need new schema + if (read_context->is_vertical_compaction) { + _can_reuse_schema = false; + } + if (!_can_reuse_schema || _context->reuse_input_schema == nullptr) { std::vector read_columns; std::set read_columns_set; std::set delete_columns_set; @@ -78,37 +88,37 @@ Status BetaRowsetReader::init(RowsetReaderContext* read_context) { read_columns.push_back(_context->return_columns->at(i)); read_columns_set.insert(_context->return_columns->at(i)); } - read_options.delete_condition_predicates->get_all_column_ids(delete_columns_set); + _read_options.delete_condition_predicates->get_all_column_ids(delete_columns_set); for (auto cid : delete_columns_set) { if (read_columns_set.find(cid) == read_columns_set.end()) { read_columns.push_back(cid); } } + VLOG_NOTICE << "read columns size: " << read_columns.size(); _input_schema = std::make_shared(_context->tablet_schema->columns(), read_columns); - - if (can_reuse_schema) { + if (_can_reuse_schema) { _context->reuse_input_schema = _input_schema; } } // if can reuse schema, context must have reuse_input_schema // if can't reuse schema, context mustn't have reuse_input_schema - DCHECK(can_reuse_schema ^ (_context->reuse_input_schema == nullptr)); + DCHECK(_can_reuse_schema ^ (_context->reuse_input_schema == nullptr)); if (_context->reuse_input_schema != nullptr && _input_schema == nullptr) { _input_schema = _context->reuse_input_schema; } if (read_context->predicates != nullptr) { - read_options.column_predicates.insert(read_options.column_predicates.end(), - read_context->predicates->begin(), - read_context->predicates->end()); + _read_options.column_predicates.insert(_read_options.column_predicates.end(), + read_context->predicates->begin(), + read_context->predicates->end()); for (auto pred : *(read_context->predicates)) { - if (read_options.col_id_to_predicates.count(pred->column_id()) < 1) { - read_options.col_id_to_predicates.insert( + if (_read_options.col_id_to_predicates.count(pred->column_id()) < 1) { + _read_options.col_id_to_predicates.insert( {pred->column_id(), std::make_shared()}); } auto single_column_block_predicate = new SingleColumnBlockPredicate(pred); - read_options.col_id_to_predicates[pred->column_id()]->add_column_predicate( + _read_options.col_id_to_predicates[pred->column_id()]->add_column_predicate( single_column_block_predicate); } } @@ -124,31 +134,31 @@ Status BetaRowsetReader::init(RowsetReaderContext* read_context) { } VLOG_TRACE << "Get the delete bitmap for rowset: " << rowset_id.to_string() << ", segment id:" << seg_id << ", size:" << d->cardinality(); - read_options.delete_bitmap.emplace(seg_id, std::move(d)); + _read_options.delete_bitmap.emplace(seg_id, std::move(d)); } } if (_should_push_down_value_predicates()) { if (read_context->value_predicates != nullptr) { - read_options.column_predicates.insert(read_options.column_predicates.end(), - 
read_context->value_predicates->begin(), - read_context->value_predicates->end()); + _read_options.column_predicates.insert(_read_options.column_predicates.end(), + read_context->value_predicates->begin(), + read_context->value_predicates->end()); for (auto pred : *(read_context->value_predicates)) { - if (read_options.col_id_to_predicates.count(pred->column_id()) < 1) { - read_options.col_id_to_predicates.insert( + if (_read_options.col_id_to_predicates.count(pred->column_id()) < 1) { + _read_options.col_id_to_predicates.insert( {pred->column_id(), std::make_shared()}); } auto single_column_block_predicate = new SingleColumnBlockPredicate(pred); - read_options.col_id_to_predicates[pred->column_id()]->add_column_predicate( + _read_options.col_id_to_predicates[pred->column_id()]->add_column_predicate( single_column_block_predicate); } } } - read_options.use_page_cache = read_context->use_page_cache; - read_options.tablet_schema = read_context->tablet_schema; - read_options.record_rowids = read_context->record_rowids; - read_options.read_orderby_key_reverse = read_context->read_orderby_key_reverse; - read_options.read_orderby_key_columns = read_context->read_orderby_key_columns; + _read_options.use_page_cache = read_context->use_page_cache; + _read_options.tablet_schema = read_context->tablet_schema; + _read_options.record_rowids = read_context->record_rowids; + _read_options.read_orderby_key_reverse = read_context->read_orderby_key_reverse; + _read_options.read_orderby_key_columns = read_context->read_orderby_key_columns; // load segments RETURN_NOT_OK(SegmentLoader::instance()->load_segments( @@ -159,7 +169,7 @@ Status BetaRowsetReader::init(RowsetReaderContext* read_context) { std::vector> seg_iterators; for (auto& seg_ptr : _segment_cache_handle.get_segments()) { std::unique_ptr iter; - auto s = seg_ptr->new_iterator(*_input_schema, read_options, &iter); + auto s = seg_ptr->new_iterator(*_input_schema, _read_options, &iter); if (!s.ok()) { LOG(WARNING) << "failed to create iterator[" << seg_ptr->id() << "]: " << s.to_string(); return Status::OLAPInternalError(OLAP_ERR_ROWSET_READER_INIT); @@ -167,11 +177,23 @@ Status BetaRowsetReader::init(RowsetReaderContext* read_context) { seg_iterators.push_back(std::move(iter)); } - std::vector iterators; for (auto& owned_it : seg_iterators) { + auto st = owned_it->init(_read_options); + if (!st.ok()) { + LOG(WARNING) << "failed to init iterator: " << st.to_string(); + return Status::OLAPInternalError(OLAP_ERR_ROWSET_READER_INIT); + } // transfer ownership of segment iterator to `_iterator` - iterators.push_back(owned_it.release()); + out_iters->push_back(owned_it.release()); } + return Status::OK(); +} + +Status BetaRowsetReader::init(RowsetReaderContext* read_context) { + RETURN_NOT_OK(_rowset->load()); + _context = read_context; + std::vector iterators; + RETURN_NOT_OK(get_segment_iterators(_context, &iterators)); // merge or union segment iterator RowwiseIterator* final_iterator; @@ -198,7 +220,7 @@ Status BetaRowsetReader::init(RowsetReaderContext* read_context) { } } - auto s = final_iterator->init(read_options); + auto s = final_iterator->init(_read_options); if (!s.ok()) { LOG(WARNING) << "failed to init iterator: " << s.to_string(); return Status::OLAPInternalError(OLAP_ERR_ROWSET_READER_INIT); @@ -216,7 +238,7 @@ Status BetaRowsetReader::init(RowsetReaderContext* read_context) { } // init input block - if (can_reuse_schema && !has_nestable_fields) { + if (_can_reuse_schema && !has_nestable_fields) { if (read_context->reuse_block == 
nullptr) { read_context->reuse_block.reset( new RowBlockV2(*_input_schema, std::min(1024, read_context->batch_size))); diff --git a/be/src/olap/rowset/beta_rowset_reader.h b/be/src/olap/rowset/beta_rowset_reader.h index 5424722c16a9ae..aaaf78b9b751e2 100644 --- a/be/src/olap/rowset/beta_rowset_reader.h +++ b/be/src/olap/rowset/beta_rowset_reader.h @@ -35,6 +35,10 @@ class BetaRowsetReader : public RowsetReader { Status init(RowsetReaderContext* read_context) override; + Status get_segment_iterators(RowsetReaderContext* read_context, + std::vector* out_iters) override; + void reset_read_options() override; + // It's ok, because we only get ref here, the block's owner is this reader. Status next_block(RowBlock** block) override; Status next_block(vectorized::Block* block) override; @@ -84,6 +88,9 @@ class BetaRowsetReader : public RowsetReader { // make sure this handle is initialized and valid before // reading data. SegmentCacheHandle _segment_cache_handle; + + StorageReadOptions _read_options; + bool _can_reuse_schema = true; }; } // namespace doris diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index 13532ee2690156..8b561b41571c0b 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -355,7 +355,7 @@ Status BetaRowsetWriter::_flush_segment_writer(std::unique_ptr l(_lock); _segment_num_rows.resize(_num_segment); _segments_encoded_key_bounds.resize(_num_segment); - _segment_num_rows[(*writer)->get_segment_id()] = (*writer)->num_rows_written(); + _segment_num_rows[(*writer)->get_segment_id()] = (*writer)->row_count(); _segments_encoded_key_bounds[(*writer)->get_segment_id()] = key_bounds; } writer->reset(); diff --git a/be/src/olap/rowset/beta_rowset_writer.h b/be/src/olap/rowset/beta_rowset_writer.h index 5980fae81ef992..de6688cb003625 100644 --- a/be/src/olap/rowset/beta_rowset_writer.h +++ b/be/src/olap/rowset/beta_rowset_writer.h @@ -85,7 +85,7 @@ class BetaRowsetWriter : public RowsetWriter { Status _flush_segment_writer(std::unique_ptr* writer); void _build_rowset_meta(std::shared_ptr rowset_meta); -private: +protected: RowsetWriterContext _context; std::shared_ptr _rowset_meta; diff --git a/be/src/olap/rowset/rowset_factory.cpp b/be/src/olap/rowset/rowset_factory.cpp index 9c8c75b2dc345d..5df3acf4f90a2a 100644 --- a/be/src/olap/rowset/rowset_factory.cpp +++ b/be/src/olap/rowset/rowset_factory.cpp @@ -23,6 +23,7 @@ #include "gen_cpp/olap_file.pb.h" #include "olap/rowset/beta_rowset_writer.h" #include "olap/rowset/rowset_writer.h" +#include "olap/rowset/vertical_beta_rowset_writer.h" namespace doris { @@ -31,7 +32,8 @@ Status RowsetFactory::create_rowset(TabletSchemaSPtr schema, const std::string& if (rowset_meta->rowset_type() == ALPHA_ROWSET) { return Status::OLAPInternalError(OLAP_ERR_ROWSET_INVALID); } - if (rowset_meta->rowset_type() == BETA_ROWSET) { + if (rowset_meta->rowset_type() == BETA_ROWSET || + rowset_meta->rowset_type() == VERTICAL_BETA_ROWSET) { rowset->reset(new BetaRowset(schema, tablet_path, rowset_meta)); return (*rowset)->init(); } @@ -47,6 +49,10 @@ Status RowsetFactory::create_rowset_writer(const RowsetWriterContext& context, output->reset(new BetaRowsetWriter); return (*output)->init(context); } + if (context.rowset_type == VERTICAL_BETA_ROWSET) { + output->reset(new VerticalBetaRowsetWriter); + return (*output)->init(context); + } return Status::OLAPInternalError(OLAP_ERR_ROWSET_TYPE_NOT_FOUND); } diff --git a/be/src/olap/rowset/rowset_reader.h 
b/be/src/olap/rowset/rowset_reader.h index eecf594254cc9d..28ecf31f5f5970 100644 --- a/be/src/olap/rowset/rowset_reader.h +++ b/be/src/olap/rowset/rowset_reader.h @@ -22,6 +22,7 @@ #include #include "gen_cpp/olap_file.pb.h" +#include "olap/iterators.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_reader_context.h" #include "vec/core/block.h" @@ -43,6 +44,10 @@ class RowsetReader { // reader init virtual Status init(RowsetReaderContext* read_context) = 0; + virtual Status get_segment_iterators(RowsetReaderContext* read_context, + std::vector* out_iters) = 0; + virtual void reset_read_options() = 0; + // read next block data into *block. // Returns // OLAP_SUCCESS when read successfully. diff --git a/be/src/olap/rowset/rowset_reader_context.h b/be/src/olap/rowset/rowset_reader_context.h index ce2fd4b721a3d2..31b115ae3348ed 100644 --- a/be/src/olap/rowset/rowset_reader_context.h +++ b/be/src/olap/rowset/rowset_reader_context.h @@ -66,6 +66,8 @@ struct RowsetReaderContext { const DeleteBitmap* delete_bitmap = nullptr; bool record_rowids = false; std::shared_ptr reuse_block; + bool is_vertical_compaction = false; + bool is_key_column_group = false; std::shared_ptr reuse_input_schema; }; diff --git a/be/src/olap/rowset/rowset_writer.h b/be/src/olap/rowset/rowset_writer.h index 2713b3c60c419e..f5d095a48c571f 100644 --- a/be/src/olap/rowset/rowset_writer.h +++ b/be/src/olap/rowset/rowset_writer.h @@ -45,6 +45,10 @@ class RowsetWriter { virtual Status add_block(const vectorized::Block* block) { return Status::OLAPInternalError(OLAP_ERR_FUNC_NOT_IMPLEMENTED); } + virtual Status add_columns(const vectorized::Block* block, const std::vector& col_ids, + bool is_key, uint32_t max_rows_per_segment) { + return Status::OLAPInternalError(OLAP_ERR_FUNC_NOT_IMPLEMENTED); + } // Precondition: the input `rowset` should have the same type of the rowset we're building virtual Status add_rowset(RowsetSharedPtr rowset) = 0; @@ -55,6 +59,12 @@ class RowsetWriter { // explicit flush all buffered rows into segment file. 
// note that `add_row` could also trigger flush when certain conditions are met virtual Status flush() = 0; + virtual Status flush_columns() { + return Status::OLAPInternalError(OLAP_ERR_FUNC_NOT_IMPLEMENTED); + } + virtual Status final_flush() { + return Status::OLAPInternalError(OLAP_ERR_FUNC_NOT_IMPLEMENTED); + } virtual Status flush_single_memtable(MemTable* memtable, int64_t* flush_size) { return Status::OLAPInternalError(OLAP_ERR_FUNC_NOT_IMPLEMENTED); diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 62b8ecc3b1ca4c..d588aa4b7fd128 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -49,8 +49,7 @@ SegmentWriter::SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, _opts(opts), _file_writer(file_writer), _mem_tracker(std::make_unique("SegmentWriter:Segment-" + - std::to_string(segment_id))), - _olap_data_convertor(tablet_schema.get()) { + std::to_string(segment_id))) { CHECK_NOTNULL(file_writer); if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { _num_key_columns = _tablet_schema->num_key_columns(); @@ -74,10 +73,9 @@ SegmentWriter::~SegmentWriter() { _mem_tracker->release(_mem_tracker->consumption()); } -void SegmentWriter::init_column_meta(ColumnMetaPB* meta, uint32_t* column_id, +void SegmentWriter::init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column, TabletSchemaSPtr tablet_schema) { - // TODO(zc): Do we need this column_id?? - meta->set_column_id((*column_id)++); + meta->set_column_id(column_id); meta->set_unique_id(column.unique_id()); meta->set_type(column.type()); meta->set_length(column.length()); @@ -91,13 +89,25 @@ void SegmentWriter::init_column_meta(ColumnMetaPB* meta, uint32_t* column_id, } Status SegmentWriter::init() { - uint32_t column_id = 0; + std::vector column_ids; + for (uint32_t i = 0; i < _tablet_schema->num_columns(); ++i) { + column_ids.emplace_back(i); + } + return init(column_ids, true); +} + +Status SegmentWriter::init(const std::vector& col_ids, bool has_key) { + DCHECK(_column_writers.empty()); + DCHECK(_column_ids.empty()); + _has_key = has_key; _column_writers.reserve(_tablet_schema->columns().size()); - for (auto& column : _tablet_schema->columns()) { + _column_ids.insert(_column_ids.end(), col_ids.begin(), col_ids.end()); + for (auto& cid : col_ids) { + const auto& column = _tablet_schema->column(cid); ColumnWriterOptions opts; opts.meta = _footer.add_columns(); - init_column_meta(opts.meta, &column_id, column, _tablet_schema); + init_column_meta(opts.meta, cid, column, _tablet_schema); // now we create zone map for key columns in AGG_KEYS or all column in UNIQUE_KEYS or DUP_KEYS // and not support zone map for array type and jsonb type. @@ -130,18 +140,24 @@ Status SegmentWriter::init() { } // we don't need the short key index for unique key merge on write table. 
- if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { - size_t seq_col_length = 0; - if (_tablet_schema->has_sequence_col()) { - seq_col_length = - _tablet_schema->column(_tablet_schema->sequence_col_idx()).length() + 1; + if (_has_key) { + if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + size_t seq_col_length = 0; + if (_tablet_schema->has_sequence_col()) { + seq_col_length = + _tablet_schema->column(_tablet_schema->sequence_col_idx()).length() + 1; + } + _primary_key_index_builder.reset( + new PrimaryKeyIndexBuilder(_file_writer, seq_col_length)); + RETURN_IF_ERROR(_primary_key_index_builder->init()); + } else { + _short_key_index_builder.reset( + new ShortKeyIndexBuilder(_segment_id, _opts.num_rows_per_block)); } - _primary_key_index_builder.reset(new PrimaryKeyIndexBuilder(_file_writer, seq_col_length)); - RETURN_IF_ERROR(_primary_key_index_builder->init()); - } else { - _short_key_index_builder.reset( - new ShortKeyIndexBuilder(_segment_id, _opts.num_rows_per_block)); } + // init olap data converter + _olap_data_convertor = + std::make_unique(_tablet_schema.get(), _column_ids); return Status::OK(); } @@ -149,63 +165,70 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po size_t num_rows) { assert(block && num_rows > 0 && row_pos + num_rows <= block->rows() && block->columns() == _column_writers.size()); - _olap_data_convertor.set_source_content(block, row_pos, num_rows); + _olap_data_convertor->set_source_content(block, row_pos, num_rows); // find all row pos for short key indexes std::vector short_key_pos; - // We build a short key index every `_opts.num_rows_per_block` rows. Specifically, we - // build a short key index using 1st rows for first block and `_short_key_row_pos - _row_count` - // for next blocks. - // Ensure we build a short key index using 1st rows only for the first block (ISSUE-9766). - if (UNLIKELY(_short_key_row_pos == 0 && _row_count == 0)) { - short_key_pos.push_back(0); - } - while (_short_key_row_pos + _opts.num_rows_per_block < _row_count + num_rows) { - _short_key_row_pos += _opts.num_rows_per_block; - short_key_pos.push_back(_short_key_row_pos - _row_count); + if (_has_key) { + // We build a short key index every `_opts.num_rows_per_block` rows. Specifically, we + // build a short key index using 1st rows for first block and `_short_key_row_pos - _row_count` + // for next blocks. + // Ensure we build a short key index using 1st rows only for the first block (ISSUE-9766). 
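The cadence implemented in the lines that follow can be traced with a small standalone program: one short-key entry for the very first row, then one every _opts.num_rows_per_block rows, each expressed as a position local to the block being appended. The block size of 1000 and num_rows_per_block of 1024 below are only example values.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Walk-through of the short-key-index cadence: an entry for the first row,
    // then one every num_rows_per_block rows, as a block-local row position.
    int main() {
        const uint32_t num_rows_per_block = 1024;          // _opts.num_rows_per_block
        uint32_t num_rows_written = 0;                     // _num_rows_written
        uint32_t short_key_row_pos = 0;                    // _short_key_row_pos
        std::vector<uint32_t> blocks = {1000, 1000, 1000}; // appended block sizes

        for (uint32_t num_rows : blocks) {
            std::vector<uint32_t> short_key_pos;
            if (short_key_row_pos == 0 && num_rows_written == 0) short_key_pos.push_back(0);
            while (short_key_row_pos + num_rows_per_block < num_rows_written + num_rows) {
                short_key_row_pos += num_rows_per_block;
                short_key_pos.push_back(short_key_row_pos - num_rows_written);
            }
            std::cout << "block of " << num_rows << " rows -> entries at local pos:";
            for (uint32_t p : short_key_pos) std::cout << ' ' << p; // 0 | 24 | 48
            std::cout << '\n';
            num_rows_written += num_rows;
        }
        return 0;
    }
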
+ if (UNLIKELY(_short_key_row_pos == 0 && _num_rows_written == 0)) { + short_key_pos.push_back(0); + } + while (_short_key_row_pos + _opts.num_rows_per_block < _num_rows_written + num_rows) { + _short_key_row_pos += _opts.num_rows_per_block; + short_key_pos.push_back(_short_key_row_pos - _num_rows_written); + } } // convert column data from engine format to storage layer format std::vector key_columns; - for (size_t cid = 0; cid < _column_writers.size(); ++cid) { - auto converted_result = _olap_data_convertor.convert_column_data(cid); + for (size_t id = 0; id < _column_writers.size(); ++id) { + // olap data convertor alway start from id = 0 + auto converted_result = _olap_data_convertor->convert_column_data(id); if (converted_result.first != Status::OK()) { return converted_result.first; } - if (cid < _num_key_columns || - (_tablet_schema->has_sequence_col() && _tablet_schema->keys_type() == UNIQUE_KEYS && - _opts.enable_unique_key_merge_on_write && cid == _tablet_schema->sequence_col_idx())) { + auto cid = _column_ids[id]; + if (_has_key && (cid < _num_key_columns || (_tablet_schema->has_sequence_col() && + _tablet_schema->keys_type() == UNIQUE_KEYS && + _opts.enable_unique_key_merge_on_write && + cid == _tablet_schema->sequence_col_idx()))) { key_columns.push_back(converted_result.second); } - RETURN_IF_ERROR(_column_writers[cid]->append(converted_result.second->get_nullmap(), - converted_result.second->get_data(), - num_rows)); + RETURN_IF_ERROR(_column_writers[id]->append(converted_result.second->get_nullmap(), + converted_result.second->get_data(), num_rows)); } - - if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { - // create primary indexes - for (size_t pos = 0; pos < num_rows; pos++) { - RETURN_IF_ERROR(_primary_key_index_builder->add_item(_encode_keys(key_columns, pos))); - } - } else { - // create short key indexes - for (const auto pos : short_key_pos) { - RETURN_IF_ERROR(_short_key_index_builder->add_item(_encode_keys(key_columns, pos))); + if (_has_key) { + if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + // create primary indexes + for (size_t pos = 0; pos < num_rows; pos++) { + RETURN_IF_ERROR( + _primary_key_index_builder->add_item(_encode_keys(key_columns, pos))); + } + } else { + // create short key indexes + for (const auto pos : short_key_pos) { + RETURN_IF_ERROR(_short_key_index_builder->add_item(_encode_keys(key_columns, pos))); + } } } - _row_count += num_rows; - _olap_data_convertor.clear_source_content(); + _num_rows_written += num_rows; + _olap_data_convertor->clear_source_content(); return Status::OK(); } int64_t SegmentWriter::max_row_to_add(size_t row_avg_size_in_bytes) { auto segment_size = estimate_segment_size(); - if (PREDICT_FALSE(segment_size >= MAX_SEGMENT_SIZE || _row_count >= _max_row_per_segment)) { + if (PREDICT_FALSE(segment_size >= MAX_SEGMENT_SIZE || + _num_rows_written >= _max_row_per_segment)) { return 0; } int64_t size_rows = ((int64_t)MAX_SEGMENT_SIZE - (int64_t)segment_size) / row_avg_size_in_bytes; - int64_t count_rows = (int64_t)_max_row_per_segment - _row_count; + int64_t count_rows = (int64_t)_max_row_per_segment - _num_rows_written; return std::min(size_rows, count_rows); } @@ -265,13 +288,13 @@ Status SegmentWriter::append_row(const RowType& row) { RETURN_IF_ERROR(_primary_key_index_builder->add_item(encoded_key)); } else { // At the beginning of one block, so add a short key index entry - if ((_row_count % _opts.num_rows_per_block) == 0) { + if 
((_num_rows_written % _opts.num_rows_per_block) == 0) { std::string encoded_key; encode_key(&encoded_key, row, _num_key_columns); RETURN_IF_ERROR(_short_key_index_builder->add_item(encoded_key)); } } - ++_row_count; + ++_num_rows_written; return Status::OK(); } @@ -299,11 +322,12 @@ uint64_t SegmentWriter::estimate_segment_size() { return size; } -Status SegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size) { - // check disk capacity - if (_data_dir != nullptr && _data_dir->reach_capacity_limit((int64_t)estimate_segment_size())) { - return Status::InternalError("disk {} exceed capacity limit.", _data_dir->path_hash()); +Status SegmentWriter::finalize_columns(uint64_t* index_size) { + if (_has_key) { + _row_count = _num_rows_written; } + _num_rows_written = 0; + for (auto& column_writer : _column_writers) { RETURN_IF_ERROR(column_writer->finish()); } @@ -313,18 +337,50 @@ Status SegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size RETURN_IF_ERROR(_write_zone_map()); RETURN_IF_ERROR(_write_bitmap_index()); RETURN_IF_ERROR(_write_bloom_filter_index()); - if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { - RETURN_IF_ERROR(_write_primary_key_index()); - } else { - RETURN_IF_ERROR(_write_short_key_index()); - } + *index_size = _file_writer->bytes_appended() - index_offset; + if (_has_key) { + if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + RETURN_IF_ERROR(_write_primary_key_index()); + } else { + RETURN_IF_ERROR(_write_short_key_index()); + } + *index_size = _file_writer->bytes_appended() - index_offset; + } + // reset all column writers and data_conveter + _reset_column_writers(); + _column_ids.clear(); + _olap_data_convertor.reset(); + return Status::OK(); +} + +Status SegmentWriter::finalize_footer(uint64_t* segment_file_size) { RETURN_IF_ERROR(_write_footer()); RETURN_IF_ERROR(_file_writer->finalize()); *segment_file_size = _file_writer->bytes_appended(); return Status::OK(); } +Status SegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size) { + // check disk capacity + if (_data_dir != nullptr && _data_dir->reach_capacity_limit((int64_t)estimate_segment_size())) { + return Status::InternalError("disk {} exceed capacity limit.", _data_dir->path_hash()); + } + + RETURN_IF_ERROR(finalize_columns(index_size)); + + // writer footer + RETURN_IF_ERROR(finalize_footer(segment_file_size)); + return Status::OK(); +} + +void SegmentWriter::_reset_column_writers() { + for (auto& column_writer : _column_writers) { + column_writer.reset(); + } + _column_writers.clear(); +} + // write column data to file one by one Status SegmentWriter::_write_data() { for (auto& column_writer : _column_writers) { diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index fd54dec72df606..5928ff4ddeb316 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -69,6 +69,9 @@ class SegmentWriter { Status init(); + // for vertical compaction + Status init(const std::vector& col_ids, bool has_key); + template Status append_row(const RowType& row); @@ -78,13 +81,18 @@ class SegmentWriter { uint64_t estimate_segment_size(); - uint32_t num_rows_written() const { return _row_count; } + uint32_t num_rows_written() const { return _num_rows_written; } + uint32_t row_count() const { return _row_count; } Status finalize(uint64_t* segment_file_size, uint64_t* index_size); - 
static void init_column_meta(ColumnMetaPB* meta, uint32_t* column_id, - const TabletColumn& column, TabletSchemaSPtr tablet_schema); uint32_t get_segment_id() { return _segment_id; } + + Status finalize_columns(uint64_t* index_size); + Status finalize_footer(uint64_t* segment_file_size); + + static void init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column, + TabletSchemaSPtr tablet_schema); Slice min_encoded_key(); Slice max_encoded_key(); @@ -102,6 +110,8 @@ class SegmentWriter { std::string _encode_keys(const std::vector& key_columns, size_t pos, bool null_first = true); + void _reset_column_writers(); + private: uint32_t _segment_id; TabletSchemaSPtr _tablet_schema; @@ -120,11 +130,16 @@ class SegmentWriter { std::unique_ptr _mem_tracker; uint32_t _row_count = 0; - vectorized::OlapBlockDataConvertor _olap_data_convertor; + std::unique_ptr _olap_data_convertor; // used for building short key index or primary key index during vectorized write. std::vector _key_coders; std::vector _key_index_size; size_t _short_key_row_pos = 0; + + std::vector _column_ids; + bool _has_key = true; + // written when add particial columns + uint32_t _num_rows_written = 0; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp new file mode 100644 index 00000000000000..ff369410c3d2df --- /dev/null +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/vertical_beta_rowset_writer.h" + +#include "olap/rowset/beta_rowset.h" + +namespace doris { + +VerticalBetaRowsetWriter::~VerticalBetaRowsetWriter() { + if (!_already_built) { + auto fs = _rowset_meta->fs(); + if (!fs) { + return; + } + for (auto& segment_writer : _segment_writers) { + segment_writer.reset(); + } + for (int i = 0; i < _num_segment; ++i) { + auto path = BetaRowset::local_segment_path(_context.tablet_path, _context.rowset_id, i); + // Even if an error is encountered, these files that have not been cleaned up + // will be cleaned up by the GC background. So here we only print the error + // message when we encounter an error. 
+ WARN_IF_ERROR(fs->delete_file(path), + strings::Substitute("Failed to delete file=$0", path)); + } + } +} + +Status VerticalBetaRowsetWriter::add_columns(const vectorized::Block* block, + const std::vector& col_ids, bool is_key, + uint32_t max_rows_per_segment) { + VLOG_NOTICE << "VerticalBetaRowsetWriter::add_columns, columns: " << block->columns(); + size_t num_rows = block->rows(); + if (num_rows == 0) { + return Status::OK(); + } + if (_segment_writers.empty()) { + // it must be key columns + DCHECK(is_key); + std::unique_ptr writer; + RETURN_IF_ERROR(_create_segment_writer(col_ids, is_key, &writer)); + _segment_writers.emplace_back(std::move(writer)); + _cur_writer_idx = 0; + RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block(block, 0, num_rows)); + } else if (is_key) { + if (_segment_writers[_cur_writer_idx]->num_rows_written() > max_rows_per_segment) { + // segment is full, need flush columns and create new segment writer + RETURN_IF_ERROR(_flush_columns(&_segment_writers[_cur_writer_idx])); + std::unique_ptr writer; + RETURN_IF_ERROR(_create_segment_writer(col_ids, is_key, &writer)); + _segment_writers.emplace_back(std::move(writer)); + ++_cur_writer_idx; + } + RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block(block, 0, num_rows)); + } else { + // value columns + uint32_t num_rows_written = _segment_writers[_cur_writer_idx]->num_rows_written(); + VLOG_NOTICE << "num_rows_written: " << num_rows_written + << ", _cur_writer_idx: " << _cur_writer_idx; + // init if it's first value column write in current segment + if (_cur_writer_idx == 0 && num_rows_written == 0) { + VLOG_NOTICE << "init first value column segment writer"; + RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->init(col_ids, is_key)); + } + if (num_rows_written > max_rows_per_segment) { + RETURN_IF_ERROR(_flush_columns(&_segment_writers[_cur_writer_idx])); + // switch to next writer + ++_cur_writer_idx; + VLOG_NOTICE << "init next value column segment writer: " << _cur_writer_idx; + RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->init(col_ids, is_key)); + } + RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block(block, 0, num_rows)); + } + if (is_key) { + _num_rows_written += num_rows; + } + return Status::OK(); +} + +Status VerticalBetaRowsetWriter::_flush_columns( + std::unique_ptr* segment_writer) { + uint64_t index_size = 0; + VLOG_NOTICE << "flush columns index: " << _cur_writer_idx; + RETURN_IF_ERROR((*segment_writer)->finalize_columns(&index_size)); + _total_index_size += static_cast(index_size); + return Status::OK(); +} + +Status VerticalBetaRowsetWriter::flush_columns() { + if (_segment_writers.empty()) { + return Status::OK(); + } + + DCHECK(_segment_writers[_cur_writer_idx]); + RETURN_IF_ERROR(_flush_columns(&_segment_writers[_cur_writer_idx])); + _cur_writer_idx = 0; + return Status::OK(); +} + +Status VerticalBetaRowsetWriter::_create_segment_writer( + const std::vector& column_ids, bool is_key, + std::unique_ptr* writer) { + auto path = BetaRowset::local_segment_path(_context.tablet_path, _context.rowset_id, + _num_segment++); + auto fs = _rowset_meta->fs(); + if (!fs) { + return Status::OLAPInternalError(OLAP_ERR_INIT_FAILED); + } + io::FileWriterPtr file_writer; + Status st = fs->create_file(path, &file_writer); + if (!st.ok()) { + LOG(WARNING) << "failed to create writable file. 
path=" << path + << ", err: " << st.get_error_msg(); + return st; + } + + DCHECK(file_writer != nullptr); + segment_v2::SegmentWriterOptions writer_options; + writer_options.enable_unique_key_merge_on_write = _context.enable_unique_key_merge_on_write; + writer->reset(new segment_v2::SegmentWriter(file_writer.get(), _num_segment, + _context.tablet_schema, _context.data_dir, + _context.max_rows_per_segment, writer_options)); + { + std::lock_guard l(_lock); + _file_writers.push_back(std::move(file_writer)); + } + + auto s = (*writer)->init(column_ids, is_key); + if (!s.ok()) { + LOG(WARNING) << "failed to init segment writer: " << s.to_string(); + writer->reset(nullptr); + return s; + } + return Status::OK(); +} + +Status VerticalBetaRowsetWriter::final_flush() { + for (auto& segment_writer : _segment_writers) { + uint64_t segment_size = 0; + //uint64_t footer_position = 0; + auto st = segment_writer->finalize_footer(&segment_size); + if (!st.ok()) { + LOG(WARNING) << "Fail to finalize segment footer, " << st; + return st; + } + _total_data_size += segment_size; + segment_writer.reset(); + } + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.h b/be/src/olap/rowset/vertical_beta_rowset_writer.h new file mode 100644 index 00000000000000..e935a89ce759ab --- /dev/null +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.h @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
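The new rowset writer drives each segment_v2::SegmentWriter through the split init()/finalize_columns()/finalize_footer() path added above: the key group writes data plus the key index, every value group re-initializes the same writer with its own column set, and the footer is written once at the end. A minimal sketch of that call order, using a stand-in MockSegmentWriter rather than the real class:

    #include <cstdint>
    #include <vector>

    // "MockSegmentWriter" only mirrors the call order VerticalBetaRowsetWriter
    // relies on; it is not the real segment_v2::SegmentWriter.
    struct MockSegmentWriter {
        void init(const std::vector<uint32_t>& /*col_ids*/, bool /*has_key*/) {}
        void append_block() {} // stands in for append_block(block, row_pos, num_rows)
        void finalize_columns(uint64_t* index_size) { *index_size = 0; }
        void finalize_footer(uint64_t* segment_file_size) { *segment_file_size = 0; }
    };

    int main() {
        MockSegmentWriter writer;
        uint64_t index_size = 0, segment_size = 0;

        // group 0: key columns (plus sequence / delete-sign for unique-key tables)
        writer.init({0, 1, 2}, /*has_key=*/true);
        writer.append_block();                 // repeated per key block
        writer.finalize_columns(&index_size);  // flushes key data and the key index

        // each value group re-initializes the same writer with a new column set
        writer.init({3, 4, 5, 6, 7}, /*has_key=*/false);
        writer.append_block();
        writer.finalize_columns(&index_size);  // value groups write no index

        // footer is written once, after the last column group
        writer.finalize_footer(&segment_size);
        return 0;
    }
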
+ +#pragma once + +#include "olap/rowset/beta_rowset_writer.h" +#include "olap/rowset/segment_v2/segment_writer.h" + +namespace doris { + +// for vertical compaction +class VerticalBetaRowsetWriter : public BetaRowsetWriter { +public: + VerticalBetaRowsetWriter() : BetaRowsetWriter() {} + ~VerticalBetaRowsetWriter(); + + Status add_columns(const vectorized::Block* block, const std::vector& col_ids, + bool is_key, uint32_t max_rows_per_segment); + + // flush last segment's column + Status flush_columns(); + + // flush when all column finished, flush column footer + Status final_flush(); + +private: + // only key group will create segment writer + Status _create_segment_writer(const std::vector& column_ids, bool is_key, + std::unique_ptr* writer); + + Status _flush_columns(std::unique_ptr* segment_writer); + +private: + std::vector> _segment_writers; + size_t _cur_writer_idx = 0; +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 1cb03ee2060286..128786c2ca6163 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -1656,6 +1656,23 @@ Status Tablet::create_initial_rowset(const int64_t req_version) { return res; } +Status Tablet::create_vertical_rowset_writer( + const Version& version, const RowsetStatePB& rowset_state, const SegmentsOverlapPB& overlap, + TabletSchemaSPtr tablet_schema, int64_t oldest_write_timestamp, + int64_t newest_write_timestamp, std::unique_ptr* rowset_writer) { + RowsetWriterContext context; + context.version = version; + context.rowset_state = rowset_state; + context.segments_overlap = overlap; + context.oldest_write_timestamp = oldest_write_timestamp; + context.newest_write_timestamp = newest_write_timestamp; + context.tablet_schema = tablet_schema; + context.enable_unique_key_merge_on_write = enable_unique_key_merge_on_write(); + _init_context_common_fields(context); + context.rowset_type = VERTICAL_BETA_ROWSET; + return RowsetFactory::create_rowset_writer(context, rowset_writer); +} + Status Tablet::create_rowset_writer(const Version& version, const RowsetStatePB& rowset_state, const SegmentsOverlapPB& overlap, TabletSchemaSPtr tablet_schema, int64_t oldest_write_timestamp, diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index 938c1f38a216c9..57428e161b2ef7 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -306,6 +306,13 @@ class Tablet : public BaseTablet { TabletSchemaSPtr tablet_schema, std::unique_ptr* rowset_writer); + Status create_vertical_rowset_writer(const Version& version, const RowsetStatePB& rowset_state, + const SegmentsOverlapPB& overlap, + TabletSchemaSPtr tablet_schema, + int64_t oldest_write_timestamp, + int64_t newest_write_timestamp, + std::unique_ptr* rowset_writer); + Status create_rowset(RowsetMetaSharedPtr rowset_meta, RowsetSharedPtr* rowset); // Cooldown to remote fs. 
Status cooldown(); diff --git a/be/src/olap/task/engine_checksum_task.cpp b/be/src/olap/task/engine_checksum_task.cpp index b6f25bb2a40971..fcd49ec7819e13 100644 --- a/be/src/olap/task/engine_checksum_task.cpp +++ b/be/src/olap/task/engine_checksum_task.cpp @@ -53,6 +53,7 @@ Status EngineChecksumTask::_compute_checksum() { TupleReader reader; TabletReader::ReaderParams reader_params; reader_params.tablet = tablet; + reader_params.tablet_schema = tablet->tablet_schema(); reader_params.reader_type = READER_CHECKSUM; reader_params.version = Version(0, _version); diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index c19f71e4c6c459..a4893c93562d8d 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -210,6 +210,8 @@ set(VEC_FILES olap/vcollect_iterator.cpp olap/block_reader.cpp olap/olap_data_convertor.cpp + olap/vertical_merge_iterator.cpp + olap/vertical_block_reader.cpp sink/vmysql_result_writer.cpp sink/vresult_sink.cpp sink/vdata_stream_sender.cpp diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index 33598c4e982e5e..85e353fb4d0987 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -33,6 +33,15 @@ OlapBlockDataConvertor::OlapBlockDataConvertor(const TabletSchema* tablet_schema } } +OlapBlockDataConvertor::OlapBlockDataConvertor(const TabletSchema* tablet_schema, + const std::vector& col_ids) { + assert(tablet_schema); + for (const auto& id : col_ids) { + const auto& col = tablet_schema->column(id); + _convertors.emplace_back(create_olap_column_data_convertor(col)); + } +} + OlapBlockDataConvertor::OlapColumnDataConvertorBaseUPtr OlapBlockDataConvertor::create_olap_column_data_convertor(const TabletColumn& column) { switch (column.type()) { diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h index 9eb63c91541132..e2249507fa8a31 100644 --- a/be/src/vec/olap/olap_data_convertor.h +++ b/be/src/vec/olap/olap_data_convertor.h @@ -48,6 +48,7 @@ class IOlapColumnDataAccessor { class OlapBlockDataConvertor { public: OlapBlockDataConvertor(const TabletSchema* tablet_schema); + OlapBlockDataConvertor(const TabletSchema* tablet_schema, const std::vector& col_ids); void set_source_content(const vectorized::Block* block, size_t row_pos, size_t num_rows); void clear_source_content(); std::pair convert_column_data(size_t cid); diff --git a/be/src/vec/olap/vertical_block_reader.cpp b/be/src/vec/olap/vertical_block_reader.cpp new file mode 100644 index 00000000000000..5d20c97f969867 --- /dev/null +++ b/be/src/vec/olap/vertical_block_reader.cpp @@ -0,0 +1,388 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
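The reader below leans on RowSourcesBuffer and the two vertical merge iterators (new_vertical_heap_merge_iterator for the key group, new_vertical_mask_merge_iterator for value groups); their implementations live in vertical_merge_iterator.cpp, which is not part of this hunk. The idea is that the key pass records, for every output row, which input it came from and whether it was merged away, and each value pass replays that record instead of comparing keys again; judging by the flush()/seek_to_begin() calls and the tablet_path argument above, the buffer presumably spills to disk once it exceeds vertical_compaction_max_row_source_memory_mb. A toy sketch of the replay idea (types and field names are illustrative only):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Toy illustration of the row-source replay behind RowSourcesBuffer; not the
    // real iterators, which live in vertical_merge_iterator.cpp.
    struct RowSource {
        uint16_t source_id; // which input the row came from
        bool agg_flag;      // true if the row was merged away and must be skipped
    };

    int main() {
        // key pass: merge-sort the key columns of two inputs and record the
        // origin of every emitted row
        std::vector<std::vector<int>> key_inputs = {{1, 4, 6}, {2, 4, 5}};
        std::vector<RowSource> row_sources;
        size_t pos0 = 0, pos1 = 0;
        while (pos0 < key_inputs[0].size() || pos1 < key_inputs[1].size()) {
            bool take0 = pos1 >= key_inputs[1].size() ||
                         (pos0 < key_inputs[0].size() &&
                          key_inputs[0][pos0] <= key_inputs[1][pos1]);
            if (take0 && pos1 < key_inputs[1].size() &&
                key_inputs[0][pos0] == key_inputs[1][pos1]) {
                // duplicate key: keep one copy, mark the other as merged away
                row_sources.push_back({0, false});
                row_sources.push_back({1, true});
                ++pos0; ++pos1;
            } else if (take0) {
                row_sources.push_back({0, false}); ++pos0;
            } else {
                row_sources.push_back({1, false}); ++pos1;
            }
        }

        // value pass: replay the recorded order, no key comparison needed
        std::vector<std::vector<char>> value_inputs = {{'a', 'b', 'c'}, {'x', 'y', 'z'}};
        std::vector<size_t> value_pos = {0, 0};
        for (const RowSource& rs : row_sources) {
            char v = value_inputs[rs.source_id][value_pos[rs.source_id]++];
            if (!rs.agg_flag) std::cout << v; // prints "axbzc"
        }
        std::cout << '\n';
        return 0;
    }
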
+ +#include "vec/olap/vertical_block_reader.h" + +#include "common/status.h" +#include "olap/like_column_predicate.h" +#include "olap/olap_common.h" +#include "runtime/mem_pool.h" +#include "vec/aggregate_functions/aggregate_function_reader.h" +#include "vec/olap/block_reader.h" +#include "vec/olap/vcollect_iterator.h" +#include "vec/olap/vertical_merge_iterator.h" + +namespace doris::vectorized { + +VerticalBlockReader::~VerticalBlockReader() { + for (int i = 0; i < _agg_functions.size(); ++i) { + _agg_functions[i]->destroy(_agg_places[i]); + delete[] _agg_places[i]; + } +} + +Status VerticalBlockReader::_get_segment_iterators(const ReaderParams& read_params, + std::vector* segment_iters) { + std::vector rs_readers; + auto res = _capture_rs_readers(read_params, &rs_readers); + if (!res.ok()) { + LOG(WARNING) << "fail to init reader when _capture_rs_readers. res:" << res + << ", tablet_id:" << read_params.tablet->tablet_id() + << ", schema_hash:" << read_params.tablet->schema_hash() + << ", reader_type:" << read_params.reader_type + << ", version:" << read_params.version; + return res; + } + _reader_context.batch_size = _batch_size; + _reader_context.is_vec = true; + _reader_context.is_vertical_compaction = true; + for (auto& rs_reader : rs_readers) { + // segment iterator will be inited here + RETURN_NOT_OK(rs_reader->get_segment_iterators(&_reader_context, segment_iters)); + rs_reader->reset_read_options(); + } + return Status::OK(); +} + +Status VerticalBlockReader::_init_collect_iter(const ReaderParams& read_params) { + // get segment iterators + std::vector segment_iters; + RETURN_IF_ERROR(_get_segment_iterators(read_params, &segment_iters)); + + // build heap if key column iterator or build vertical merge iterator if value column + if (read_params.is_key_column_group) { + uint32_t seq_col_idx = -1; + if (read_params.tablet->tablet_schema()->has_sequence_col()) { + seq_col_idx = read_params.tablet->tablet_schema()->sequence_col_idx(); + } + _vcollect_iter = new_vertical_heap_merge_iterator( + segment_iters, read_params.tablet->keys_type(), seq_col_idx, _row_sources_buffer); + } else { + auto ori_return_col_size = _return_columns.size(); + _vcollect_iter = new_vertical_mask_merge_iterator(segment_iters, ori_return_col_size, + _row_sources_buffer); + } + // init collect iterator + StorageReadOptions opts; + RETURN_IF_ERROR(_vcollect_iter->init(opts)); + + // In dup keys value columns compact, get first row for _init_agg_state + if (!read_params.is_key_column_group && read_params.tablet->keys_type() == KeysType::AGG_KEYS) { + auto st = _vcollect_iter->next_row(&_next_row); + _eof = st.is_end_of_file(); + } + + return Status::OK(); +} + +void VerticalBlockReader::_init_agg_state(const ReaderParams& read_params) { + if (_eof) { + return; + } + DCHECK(_return_columns.size() == _next_row.block->columns()); + _stored_data_columns = _next_row.block->create_same_struct_block(_batch_size)->mutate_columns(); + + _stored_has_null_tag.resize(_stored_data_columns.size()); + _stored_has_string_tag.resize(_stored_data_columns.size()); + + auto& tablet_schema = *_tablet_schema; + for (size_t idx = 0; idx < _return_columns.size(); ++idx) { + AggregateFunctionPtr function = + tablet_schema.column(_return_columns.at(idx)) + .get_aggregate_function({_next_row.block->get_data_type(idx)}, + vectorized::AGG_READER_SUFFIX); + DCHECK(function != nullptr); + _agg_functions.push_back(function); + // create aggregate data + AggregateDataPtr place = new char[function->size_of_data()]; + function->create(place); + 
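+ // the raw aggregate buffer is kept in _agg_places and released (destroy + delete[])
+ // in ~VerticalBlockReader()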
_agg_places.push_back(place); + + // calculate `has_string` tag. + _stored_has_string_tag[idx] = + _stored_data_columns[idx]->is_column_string() || + (_stored_data_columns[idx]->is_nullable() && + reinterpret_cast(_stored_data_columns[idx].get()) + ->get_nested_column_ptr() + ->is_column_string()); + } +} + +Status VerticalBlockReader::init(const ReaderParams& read_params) { + _batch_size = 4096; + RETURN_NOT_OK(TabletReader::init(read_params)); + + std::vector rs_readers; + auto status = _init_collect_iter(read_params); + if (!status.ok()) { + return status; + } + + switch (tablet()->keys_type()) { + case KeysType::DUP_KEYS: + _next_block_func = &VerticalBlockReader::_direct_next_block; + break; + case KeysType::UNIQUE_KEYS: + _next_block_func = &VerticalBlockReader::_unique_key_next_block; + if (_filter_delete) { + _delete_filter_column = ColumnUInt8::create(); + } + break; + case KeysType::AGG_KEYS: + _next_block_func = &VerticalBlockReader::_agg_key_next_block; + if (!read_params.is_key_column_group) { + _init_agg_state(read_params); + } + break; + default: + DCHECK(false) << "No next row function for type:" << tablet()->keys_type(); + break; + } + return Status::OK(); +} + +Status VerticalBlockReader::_direct_next_block(Block* block, MemPool* mem_pool, + ObjectPool* agg_pool, bool* eof) { + auto res = _vcollect_iter->next_batch(block); + if (UNLIKELY(!res.ok() && !res.is_end_of_file())) { + return res; + } + *eof = (res.is_end_of_file()); + _eof = *eof; + return Status::OK(); +} + +void VerticalBlockReader::_append_agg_data(MutableColumns& columns) { + _stored_row_ref.push_back(_next_row); + _last_agg_data_counter++; + + // execute aggregate when have `batch_size` column or some ref invalid soon + bool is_last = (_next_row.block->rows() == _next_row.row_pos + 1); + if (is_last || _stored_row_ref.size() == _batch_size) { + _update_agg_data(columns); + } +} + +void VerticalBlockReader::_update_agg_data(MutableColumns& columns) { + // copy data to stored block + size_t copy_size = _copy_agg_data(); + + // calculate has_null_tag + for (size_t idx = 0; idx < _return_columns.size(); ++idx) { + _stored_has_null_tag[idx] = _stored_data_columns[idx]->has_null(copy_size); + } + + // calculate aggregate and insert + int counter_sum = 0; + for (int counter : _agg_data_counters) { + _update_agg_value(columns, counter_sum, counter_sum + counter - 1); + counter_sum += counter; + } + + // some key still has value at next block, so do not insert + if (_last_agg_data_counter) { + _update_agg_value(columns, counter_sum, counter_sum + _last_agg_data_counter - 1, false); + _last_agg_data_counter = 0; + } + + _agg_data_counters.clear(); +} + +void VerticalBlockReader::_update_agg_value(MutableColumns& columns, int begin, int end, + bool is_close) { + for (size_t idx = 0; idx < _return_columns.size(); ++idx) { + AggregateFunctionPtr function = _agg_functions[idx]; + AggregateDataPtr place = _agg_places[idx]; + auto column_ptr = _stored_data_columns[idx].get(); + + if (begin <= end) { + function->add_batch_range(begin, end, place, const_cast(&column_ptr), + nullptr, _stored_has_null_tag[idx]); + } + + if (is_close) { + function->insert_result_into(place, *columns[idx]); + // reset aggregate data + function->destroy(place); + function->create(place); + } + } +} + +size_t VerticalBlockReader::_copy_agg_data() { + size_t copy_size = _stored_row_ref.size(); + + for (size_t i = 0; i < copy_size; i++) { + auto& ref = _stored_row_ref[i]; + _temp_ref_map[ref.block.get()].emplace_back(ref.row_pos, i); + } + for 
(size_t idx = 0; idx < _return_columns.size(); ++idx) { + auto& dst_column = _stored_data_columns[idx]; + if (_stored_has_string_tag[idx]) { + //string type should replace ordered + for (size_t i = 0; i < copy_size; i++) { + auto& ref = _stored_row_ref[i]; + dst_column->replace_column_data(*ref.block->get_by_position(idx).column, + ref.row_pos, i); + } + } else { + for (auto& it : _temp_ref_map) { + if (!it.second.empty()) { + auto& src_column = *it.first->get_by_position(idx).column; + for (auto& pos : it.second) { + dst_column->replace_column_data(src_column, pos.first, pos.second); + } + } + } + } + } + + for (auto& it : _temp_ref_map) { + it.second.clear(); + } + _stored_row_ref.clear(); + + return copy_size; +} + +Status VerticalBlockReader::_agg_key_next_block(Block* block, MemPool* mem_pool, + ObjectPool* agg_pool, bool* eof) { + if (_reader_context.is_key_column_group) { + // collect_iter will filter agg keys + auto res = _vcollect_iter->next_batch(block); + if (UNLIKELY(!res.ok() && !res.is_end_of_file())) { + return res; + } + *eof = (res.is_end_of_file()); + _eof = *eof; + return Status::OK(); + } + // handle value agg + if (UNLIKELY(_eof)) { + *eof = true; + return Status::OK(); + } + int target_block_row = 0; + auto target_columns = block->mutate_columns(); + + // copy first row get from collect_iter in init + _append_agg_data(target_columns); + target_block_row++; + + do { + Status res = _vcollect_iter->next_row(&_next_row); + if (UNLIKELY(!res.ok())) { + if (UNLIKELY(res.is_end_of_file())) { + *eof = true; + _eof = true; + break; + } + LOG(WARNING) << "next failed: " << res; + return res; + } + DCHECK(_next_row.block->columns() == block->columns()); + if (!_next_row.is_same) { + if (target_block_row == _batch_size) { + break; + } + _agg_data_counters.push_back(_last_agg_data_counter); + _last_agg_data_counter = 0; + target_block_row++; + } + _append_agg_data(target_columns); + } while (true); + + _agg_data_counters.push_back(_last_agg_data_counter); + _last_agg_data_counter = 0; + _update_agg_data(target_columns); + + return Status::OK(); +} + +Status VerticalBlockReader::_unique_key_next_block(Block* block, MemPool* mem_pool, + ObjectPool* agg_pool, bool* eof) { + if (_reader_context.is_key_column_group) { + // Record row_source_buffer current size for key column agg flag + // _vcollect_iter->next_batch(block) will fill row_source_buffer but delete sign is ignored + // we calc delete sign column if it's base compaction and update row_sourece_buffer's agg flag + // after we get current block + auto row_source_idx = _row_sources_buffer->buffered_size(); + + auto res = _vcollect_iter->next_batch(block); + if (UNLIKELY(!res.ok() && !res.is_end_of_file())) { + return res; + } + auto block_rows = block->rows(); + if (_filter_delete && block_rows > 0) { + auto target_columns = block->mutate_columns(); + int delete_sign_idx = block->columns() - 1; + DCHECK(delete_sign_idx > 0); + MutableColumnPtr delete_filter_column = (*std::move(_delete_filter_column)).mutate(); + reinterpret_cast(delete_filter_column.get())->resize(block_rows); + + auto* __restrict filter_data = + reinterpret_cast(delete_filter_column.get())->get_data().data(); + auto* __restrict delete_data = + reinterpret_cast(target_columns[delete_sign_idx].get()) + ->get_data() + .data(); + for (int i = 0; i < block_rows; ++i) { + bool sign = (delete_data[i] == 0); + filter_data[i] = sign; + if (UNLIKELY(!sign)) { + _row_sources_buffer->set_agg_flag(row_source_idx + i, true); + } + } + + ColumnWithTypeAndName 
column_with_type_and_name {_delete_filter_column, + std::make_shared(), + "__DORIS_COMPACTION_FILTER__"}; + block->insert(column_with_type_and_name); + Block::filter_block(block, target_columns.size(), target_columns.size()); + _stats.rows_del_filtered += block_rows - block->rows(); + DCHECK(block->try_get_by_name("__DORIS_COMPACTION_FILTER__") == nullptr); + } + *eof = (res.is_end_of_file()); + _eof = *eof; + return Status::OK(); + } + int target_block_row = 0; + auto target_columns = block->mutate_columns(); + size_t column_count = block->columns(); + do { + Status res = _vcollect_iter->unique_key_next_row(&_next_row); + if (UNLIKELY(!res.ok())) { + if (UNLIKELY(res.is_end_of_file())) { + *eof = true; + _eof = true; + break; + } + LOG(WARNING) << "next failed: " << res; + return res; + } + const auto& src_block = _next_row.block; + assert(src_block->columns() == column_count); + for (size_t i = 0; i < column_count; ++i) { + target_columns[i]->insert_from(*(src_block->get_by_position(i).column), + _next_row.row_pos); + } + ++target_block_row; + } while (target_block_row < _batch_size); + return Status::OK(); +} + +} // namespace doris::vectorized diff --git a/be/src/vec/olap/vertical_block_reader.h b/be/src/vec/olap/vertical_block_reader.h new file mode 100644 index 00000000000000..7c2e99eacfc58c --- /dev/null +++ b/be/src/vec/olap/vertical_block_reader.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "olap/reader.h" +#include "olap/rowset/rowset_reader.h" +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/olap/vcollect_iterator.h" +#include "vec/olap/vertical_merge_iterator.h" + +#pragma once + +namespace doris { + +namespace vectorized { + +class VerticalBlockReader final : public TabletReader { +public: + VerticalBlockReader(RowSourcesBuffer* row_sources_buffer) + : _row_sources_buffer(row_sources_buffer) {} + + ~VerticalBlockReader() override; + + // Initialize VerticalBlockReader with tablet, data version and fetch range. + Status init(const ReaderParams& read_params) override; + + Status next_block_with_aggregation(Block* block, MemPool* mem_pool, ObjectPool* agg_pool, + bool* eof) override { + return (this->*_next_block_func)(block, mem_pool, agg_pool, eof); + } + + Status next_row_with_aggregation(RowCursor* row_cursor, MemPool* mem_pool, ObjectPool* agg_pool, + bool* eof) override { + return Status::OK(); + } + + uint64_t merged_rows() const override { + DCHECK(_vcollect_iter); + return _vcollect_iter->merged_rows(); + } + +private: + // Directly read row from rowset and pass to upper caller. No need to do aggregation. 
+ // This is usually used for DUPLICATE KEY tables + Status _direct_next_block(Block* block, MemPool* mem_pool, ObjectPool* agg_pool, bool* eof); + // For normal AGGREGATE KEY tables, read data by a merge heap. + Status _agg_key_next_block(Block* block, MemPool* mem_pool, ObjectPool* agg_pool, bool* eof); + // For UNIQUE KEY tables, read data by a merge heap. + // The difference from _agg_key_next_block is that it will read the data from high version to low version, + // to minimize the comparison time in merge heap. + Status _unique_key_next_block(Block* block, MemPool* mem_pool, ObjectPool* agg_pool, bool* eof); + + Status _init_collect_iter(const ReaderParams& read_params); + + Status _get_segment_iterators(const ReaderParams& read_params, + std::vector* segment_iters); + + void _init_agg_state(const ReaderParams& read_params); + void _append_agg_data(MutableColumns& columns); + void _update_agg_data(MutableColumns& columns); + size_t _copy_agg_data(); + void _update_agg_value(MutableColumns& columns, int begin, int end, bool is_close = true); + +private: + std::shared_ptr _vcollect_iter; + IteratorRowRef _next_row {{}, -1, false}; + + bool _eof = false; + + Status (VerticalBlockReader::*_next_block_func)(Block* block, MemPool* mem_pool, + ObjectPool* agg_pool, bool* eof) = nullptr; + + RowSourcesBuffer* _row_sources_buffer; + ColumnPtr _delete_filter_column; + + // for agg mode + std::vector _agg_functions; + std::vector _agg_places; + + std::vector _normal_columns_idx; + std::vector _agg_columns_idx; + + std::vector _agg_data_counters; + int _last_agg_data_counter = 0; + + MutableColumns _stored_data_columns; + std::vector _stored_row_ref; + + std::vector _stored_has_null_tag; + std::vector _stored_has_string_tag; + + phmap::flat_hash_map>> _temp_ref_map; +}; + +} // namespace vectorized +} // namespace doris diff --git a/be/src/vec/olap/vertical_merge_iterator.cpp b/be/src/vec/olap/vertical_merge_iterator.cpp new file mode 100644 index 00000000000000..2755c204e1eb75 --- /dev/null +++ b/be/src/vec/olap/vertical_merge_iterator.cpp @@ -0,0 +1,545 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/olap/vertical_merge_iterator.h" + +namespace doris { + +namespace vectorized { + +// -------------- row source ---------------// +RowSource::RowSource(uint16_t source_num, bool agg_flag) { + _data = (source_num & SOURCE_FLAG) | (source_num & AGG_FLAG); + _data = agg_flag ? (_data | AGG_FLAG) : (_data & SOURCE_FLAG); +} + +uint16_t RowSource::get_source_num() { + return _data & SOURCE_FLAG; +} + +bool RowSource::agg_flag() { + return (_data & AGG_FLAG) != 0; +} + +void RowSource::set_agg_flag(bool agg_flag) { + _data = agg_flag ? 
(_data | AGG_FLAG) : (_data & SOURCE_FLAG); +} + +uint16_t RowSource::data() const { + return _data; +} + +/* -------------- row source buffer ------------- */ + +// current row_sources must save in memory so agg key can update agg flag +Status RowSourcesBuffer::append(const std::vector& row_sources) { + if (_buffer->allocated_bytes() + row_sources.size() * sizeof(UInt16) > + config::vertical_compaction_max_row_source_memory_mb * 1024 * 1024) { + // serialize current buffer + RETURN_IF_ERROR(_create_buffer_file()); + RETURN_IF_ERROR(_serialize()); + _reset_buffer(); + } + for (const auto& source : row_sources) { + _buffer->insert_value(source.data()); + } + _total_size += row_sources.size(); + return Status::OK(); +} + +Status RowSourcesBuffer::seek_to_begin() { + _buf_idx = 0; + if (_fd > 0) { + auto offset = lseek(_fd, 0, SEEK_SET); + if (offset != 0) { + LOG(WARNING) << "failed to seek to 0"; + return Status::InternalError("failed to seek to 0"); + } + _reset_buffer(); + } + return Status::OK(); +} + +Status RowSourcesBuffer::has_remaining() { + if (_buf_idx < _buffer->size()) { + return Status::OK(); + } + DCHECK(_buf_idx == _buffer->size()); + if (_fd > 0) { + _reset_buffer(); + auto st = _deserialize(); + if (!st.ok()) { + return st; + } + return Status::OK(); + } + return Status::EndOfFile("end of row source buffer"); +} + +void RowSourcesBuffer::set_agg_flag(uint64_t index, bool agg) { + DCHECK(index < _buffer->size()); + RowSource ori(_buffer->get_data()[index]); + ori.set_agg_flag(agg); + _buffer->get_data()[index] = ori.data(); +} + +size_t RowSourcesBuffer::same_source_count(uint16_t source, size_t limit) { + int result = 1; + int start = _buf_idx + 1; + int end = _buffer->size(); + while (result < limit && start < end) { + RowSource next(_buffer->get_element(start++)); + if (source != next.get_source_num()) { + break; + } + ++result; + } + return result; +} + +Status RowSourcesBuffer::_create_buffer_file() { + if (_fd != -1) { + return Status::OK(); + } + std::stringstream file_path; + file_path << _tablet_path << "/compaction_row_source_" << _tablet_id; + if (_reader_type == READER_BASE_COMPACTION) { + file_path << "_base"; + } else if (_reader_type == READER_CUMULATIVE_COMPACTION) { + file_path << "_cumu"; + } else { + DCHECK(false); + return Status::InternalError("unknown reader type"); + } + file_path << ".XXXXXX"; + LOG(INFO) << "Vertical compaction row sources buffer path: " << file_path.str(); + _fd = mkstemp(file_path.str().data()); + if (_fd < 0) { + LOG(WARNING) << "failed to create tmp file, file_path=" << file_path.str(); + return Status::InternalError("failed to create tmp file"); + } + // file will be released after fd is close + unlink(file_path.str().data()); + return Status::OK(); +} + +Status RowSourcesBuffer::flush() { + if (_fd > 0 && !_buffer->empty()) { + RETURN_IF_ERROR(_serialize()); + _reset_buffer(); + } + return Status::OK(); +} + +Status RowSourcesBuffer::_serialize() { + size_t rows = _buffer->size(); + if (rows == 0) { + return Status::OK(); + } + // write size + ssize_t bytes_written = ::write(_fd, &rows, sizeof(rows)); + if (bytes_written != sizeof(size_t)) { + LOG(WARNING) << "failed to write buffer size to file, bytes_written=" << bytes_written; + return Status::InternalError("fail to write buffer size to file"); + } + // write data + StringRef ref = _buffer->get_raw_data(); + bytes_written = ::write(_fd, ref.data, ref.size * sizeof(UInt16)); + if (bytes_written != _buffer->byte_size()) { + LOG(WARNING) << "failed to write buffer data to file, 
bytes_written=" << bytes_written + << " buffer size=" << _buffer->byte_size(); + return Status::InternalError("fail to write buffer size to file"); + } + return Status::OK(); +} + +Status RowSourcesBuffer::_deserialize() { + size_t rows = 0; + ssize_t bytes_read = ::read(_fd, &rows, sizeof(rows)); + if (bytes_read == 0) { + LOG(WARNING) << "end of row source buffer file"; + return Status::EndOfFile("end of row source buffer file"); + } else if (bytes_read != sizeof(size_t)) { + LOG(WARNING) << "failed to read buffer size from file, bytes_read=" << bytes_read; + return Status::InternalError("failed to read buffer size from file"); + } + _buffer->resize(rows); + auto& internal_data = _buffer->get_data(); + bytes_read = ::read(_fd, internal_data.data(), rows * sizeof(UInt16)); + if (bytes_read != rows * sizeof(UInt16)) { + LOG(WARNING) << "failed to read buffer data from file, bytes_read=" << bytes_read + << ", expect bytes=" << rows * sizeof(UInt16); + return Status::InternalError("failed to read buffer data from file"); + } + return Status::OK(); +} + +// ---------- vertical merge iterator context ----------// +Status VerticalMergeIteratorContext::block_reset(const std::shared_ptr& block) { + if (!*block) { + const Schema& schema = _iter->schema(); + const auto& column_ids = schema.column_ids(); + for (size_t i = 0; i < schema.num_column_ids(); ++i) { + auto column_desc = schema.column(column_ids[i]); + auto data_type = Schema::get_data_type_ptr(*column_desc); + if (data_type == nullptr) { + return Status::RuntimeError("invalid data type"); + } + auto column = data_type->create_column(); + column->reserve(_block_row_max); + block->insert(ColumnWithTypeAndName(std::move(column), data_type, column_desc->name())); + } + } else { + block->clear_column_data(); + } + return Status::OK(); +} + +bool VerticalMergeIteratorContext::compare(const VerticalMergeIteratorContext& rhs) const { + int cmp_res = _block->compare_at(_index_in_block, rhs._index_in_block, _num_key_columns, + *rhs._block, -1); + if (cmp_res != 0) { + return cmp_res > 0; + } + auto col_cmp_res = 0; + if (_seq_col_idx != -1) { + DCHECK(_block->columns() > 2); + auto real_seq_idx = _block->columns() - 2; + col_cmp_res = _block->compare_column_at(_index_in_block, rhs._index_in_block, real_seq_idx, + *rhs._block, -1); + } + auto result = (col_cmp_res == 0) ? (_order < rhs.order()) : (col_cmp_res < 0); + result ? 
set_is_same(true) : rhs.set_is_same(true); + return result; +} + +void VerticalMergeIteratorContext::copy_rows(Block* block, size_t count) { + Block& src = *_block; + Block& dst = *block; + DCHECK(count > 0); + + auto start = _index_in_block; + _index_in_block += count - 1; + + for (size_t i = 0; i < _ori_return_cols; ++i) { + auto& s_col = src.get_by_position(i); + auto& d_col = dst.get_by_position(i); + + ColumnPtr& s_cp = s_col.column; + ColumnPtr& d_cp = d_col.column; + + d_cp->assume_mutable()->insert_range_from(*s_cp, start, count); + } +} +// `advanced = false` when current block finished +void VerticalMergeIteratorContext::copy_rows(Block* block, bool advanced) { + Block& src = *_block; + Block& dst = *block; + if (_cur_batch_num == 0) { + return; + } + + // copy a row to dst block column by column + size_t start = _index_in_block - _cur_batch_num + 1 - advanced; + DCHECK(start >= 0); + + for (size_t i = 0; i < _ori_return_cols; ++i) { + auto& s_col = src.get_by_position(i); + auto& d_col = dst.get_by_position(i); + + ColumnPtr& s_cp = s_col.column; + ColumnPtr& d_cp = d_col.column; + + d_cp->assume_mutable()->insert_range_from(*s_cp, start, _cur_batch_num); + } + _cur_batch_num = 0; +} + +Status VerticalMergeIteratorContext::init(const StorageReadOptions& opts) { + _block_row_max = opts.block_row_max; + RETURN_IF_ERROR(_load_next_block()); + if (valid()) { + RETURN_IF_ERROR(advance()); + } + return Status::OK(); +} + +Status VerticalMergeIteratorContext::advance() { + // NOTE: we increase _index_in_block directly to valid one check + do { + _index_in_block++; + if (LIKELY(_index_in_block < _block->rows())) { + return Status::OK(); + } + // current batch has no data, load next batch + RETURN_IF_ERROR(_load_next_block()); + } while (_valid); + return Status::OK(); +} + +Status VerticalMergeIteratorContext::_load_next_block() { + do { + if (_block != nullptr) { + _block_list.push_back(_block); + _block = nullptr; + } + for (auto it = _block_list.begin(); it != _block_list.end(); it++) { + if (it->use_count() == 1) { + block_reset(*it); + _block = *it; + _block_list.erase(it); + break; + } + } + if (_block == nullptr) { + _block = std::make_shared(); + block_reset(_block); + } + Status st = _iter->next_batch(_block.get()); + if (!st.ok()) { + _valid = false; + if (st.is_end_of_file()) { + return Status::OK(); + } else { + return st; + } + } + // erase delete handler columns + if (_num_columns > _ori_return_cols) { + for (auto i = _num_columns - 1; i >= _ori_return_cols; --i) { + _block->erase(i); + } + } + } while (_block->rows() == 0); + _index_in_block = -1; + _valid = true; + return Status::OK(); +} + +// ---------------- VerticalHeapMergeIterator ------------- // +Status VerticalHeapMergeIterator::next_batch(Block* block) { + size_t row_idx = 0; + VerticalMergeIteratorContext* pre_ctx = nullptr; + std::vector tmp_row_sources; + while (_get_size(block) < _block_row_max) { + if (_merge_heap.empty()) { + LOG(INFO) << "_merge_heap empty"; + break; + } + + auto ctx = _merge_heap.top(); + _merge_heap.pop(); + if (ctx->is_same()) { + tmp_row_sources.emplace_back(ctx->order(), true); + } else { + tmp_row_sources.emplace_back(ctx->order(), false); + } + if (ctx->is_same() && + (_keys_type == KeysType::UNIQUE_KEYS || _keys_type == KeysType::AGG_KEYS)) { + // skip cur row, copy pre ctx + ++_merged_rows; + if (pre_ctx) { + pre_ctx->copy_rows(block); + pre_ctx = nullptr; + } + } else { + ctx->add_cur_batch(); + if (pre_ctx != ctx) { + if (pre_ctx) { + pre_ctx->copy_rows(block); + } + pre_ctx = 
ctx; + } + row_idx++; + if (ctx->is_cur_block_finished() || row_idx >= _block_row_max) { + // current block finished, ctx not advance + // so copy start_idx = (_index_in_block - _cur_batch_num + 1) + ctx->copy_rows(block, false); + pre_ctx = nullptr; + } + } + + RETURN_IF_ERROR(ctx->advance()); + if (ctx->valid()) { + _merge_heap.push(ctx); + } else { + // Release ctx earlier to reduce resource consumed + delete ctx; + } + } + RETURN_IF_ERROR(_row_sources_buf->append(tmp_row_sources)); + if (!_merge_heap.empty()) { + return Status::OK(); + } + return Status::EndOfFile("no more data in segment"); +} + +Status VerticalHeapMergeIterator::init(const StorageReadOptions& opts) { + if (_origin_iters.empty()) { + return Status::OK(); + } + _schema = &(*_origin_iters.begin())->schema(); + + auto seg_order = 0; + for (auto iter : _origin_iters) { + auto ctx = std::make_unique( + iter, iter->schema().num_column_ids(), seg_order, _seq_col_idx); + RETURN_IF_ERROR(ctx->init(opts)); + if (!ctx->valid()) { + continue; + } + _merge_heap.push(ctx.release()); + ++seg_order; + } + _origin_iters.clear(); + + _block_row_max = opts.block_row_max; + return Status::OK(); +} + +// ---------------- VerticalMaskMergeIterator ------------- // +Status VerticalMaskMergeIterator::next_row(vectorized::IteratorRowRef* ref) { + DCHECK(_row_sources_buf); + auto st = _row_sources_buf->has_remaining(); + if (!st.ok()) { + if (st.is_end_of_file()) { + for (auto iter : _origin_iter_ctx) { + RETURN_IF_ERROR(iter->advance()); + DCHECK(!iter->valid()); + } + } + return st; + } + auto row_source = _row_sources_buf->current(); + uint16_t order = row_source.get_source_num(); + auto& ctx = _origin_iter_ctx[order]; + if (UNLIKELY(ctx->is_first_row())) { + // first row in block, don't call ctx->advance + // Except first row, we call advance first and than get cur row + ctx->set_cur_row_ref(ref); + ref->is_same = row_source.agg_flag(); + + ctx->set_is_first_row(false); + _row_sources_buf->advance(); + return Status::OK(); + } + RETURN_IF_ERROR(ctx->advance()); + ctx->set_cur_row_ref(ref); + ref->is_same = row_source.agg_flag(); + + _row_sources_buf->advance(); + return Status::OK(); +} + +Status VerticalMaskMergeIterator::unique_key_next_row(vectorized::IteratorRowRef* ref) { + DCHECK(_row_sources_buf); + auto st = _row_sources_buf->has_remaining(); + while (st.ok()) { + auto row_source = _row_sources_buf->current(); + uint16_t order = row_source.get_source_num(); + auto& ctx = _origin_iter_ctx[order]; + if (UNLIKELY(ctx->is_first_row()) && !row_source.agg_flag()) { + // first row in block, don't call ctx->advance + // Except first row, we call advance first and than get cur row + ctx->set_cur_row_ref(ref); + ctx->set_is_first_row(false); + _row_sources_buf->advance(); + return Status::OK(); + } + RETURN_IF_ERROR(ctx->advance()); + _row_sources_buf->advance(); + if (!row_source.agg_flag()) { + ctx->set_cur_row_ref(ref); + return Status::OK(); + } + st = _row_sources_buf->has_remaining(); + } + if (st.is_end_of_file()) { + for (auto iter : _origin_iter_ctx) { + RETURN_IF_ERROR(iter->advance()); + DCHECK(!iter->valid()); + } + } + return st; +} + +Status VerticalMaskMergeIterator::next_batch(Block* block) { + DCHECK(_row_sources_buf); + size_t rows = 0; + auto st = _row_sources_buf->has_remaining(); + while (rows < _block_row_max && st.ok()) { + uint16_t order = _row_sources_buf->current().get_source_num(); + DCHECK(order < _origin_iter_ctx.size()); + auto& ctx = _origin_iter_ctx[order]; + + // find max same source count in cur ctx + size_t 
limit = std::min(ctx->remain_rows(), _block_row_max - rows); + auto same_source_cnt = _row_sources_buf->same_source_count(order, limit); + _row_sources_buf->advance(same_source_cnt); + // copy rows to block + ctx->copy_rows(block, same_source_cnt); + RETURN_IF_ERROR(ctx->advance()); + rows += same_source_cnt; + st = _row_sources_buf->has_remaining(); + } + if (st.is_end_of_file()) { + for (auto iter : _origin_iter_ctx) { + RETURN_IF_ERROR(iter->advance()); + DCHECK(!iter->valid()); + } + } + return st; +} + +Status VerticalMaskMergeIterator::init(const StorageReadOptions& opts) { + if (_origin_iters.empty()) { + return Status::OK(); + } + _schema = &(*_origin_iters.begin())->schema(); + + for (auto iter : _origin_iters) { + auto ctx = std::make_unique(iter, _ori_return_cols, -1, -1); + RETURN_IF_ERROR(ctx->init(opts)); + if (!ctx->valid()) { + continue; + } + _origin_iter_ctx.emplace_back(ctx.release()); + } + + _origin_iters.clear(); + + _block_row_max = opts.block_row_max; + return Status::OK(); +} + +// interfaces to create vertical merge iterator +std::shared_ptr new_vertical_heap_merge_iterator( + const std::vector& inputs, KeysType keys_type, uint32_t seq_col_idx, + RowSourcesBuffer* row_sources) { + return std::make_shared(std::move(inputs), keys_type, seq_col_idx, + row_sources); +} + +std::shared_ptr new_vertical_mask_merge_iterator( + const std::vector& inputs, size_t ori_return_cols, + RowSourcesBuffer* row_sources) { + return std::make_shared(std::move(inputs), ori_return_cols, + row_sources); +} + +} // namespace vectorized +} // namespace doris \ No newline at end of file diff --git a/be/src/vec/olap/vertical_merge_iterator.h b/be/src/vec/olap/vertical_merge_iterator.h new file mode 100644 index 00000000000000..d2c66b1ed53939 --- /dev/null +++ b/be/src/vec/olap/vertical_merge_iterator.h @@ -0,0 +1,314 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "common/status.h" +#include "olap/iterators.h" +#include "olap/schema.h" +#include "vec/columns/columns_number.h" +#include "vec/core/block.h" + +#pragma once + +namespace doris { + +namespace vectorized { + +// Row source represent row location in multi-segments +// use a uint16_t to store info +// the lower 15 bits means segment_id in segment pool, and the higher 1 bits means agg flag. +// In unique-key, agg flags means this key should be deleted, this comes from two way: old version +// key or delete_sign. 
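+// For example, RowSource(3, true).data() == 0x8003 (0x8000 | 3); get_source_num()
+// masks with 0x7FFF and returns 3, and set_agg_flag(false) clears it back to 0x0003.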
+class RowSource { +public: + RowSource(uint16_t data) : _data(data) {} + RowSource(uint16_t source_num, bool agg_flag); + + uint16_t get_source_num(); + bool agg_flag(); + void set_agg_flag(bool agg_flag); + uint16_t data() const; + +private: + uint16_t _data; + static const uint16_t SOURCE_FLAG = 0x7FFF; + static const uint16_t AGG_FLAG = 0x8000; +}; + +/* rows source buffer +this buffer should have a memory limit, once reach memory limit, write +buffer data to tmp file. +usage: + RowSourcesBuffer buffer(tablet_id, tablet_storage_path, reader_type); + buffer.append() + buffer.append() + buffer.flush() + buffer.seek_to_begin() + while (buffer.has_remaining().ok()) { + auto cur = buffer.current().get_source_num(); + auto same = buffer.same_source_count(cur, limit); + // do copy block data + buffer.advance(same); + } +*/ +class RowSourcesBuffer { +public: + RowSourcesBuffer(int64_t tablet_id, const std::string& tablet_path, ReaderType reader_type) + : _tablet_id(tablet_id), + _tablet_path(tablet_path), + _reader_type(reader_type), + _buffer(ColumnUInt16::create()) {} + + ~RowSourcesBuffer() { + _reset_buffer(); + if (_fd > 0) { + ::close(_fd); + } + } + + // write batch row source + Status append(const std::vector& row_sources); + Status flush(); + + RowSource current() { + DCHECK(_buf_idx < _buffer->size()); + return RowSource(_buffer->get_element(_buf_idx)); + } + void advance(int32_t step = 1) { + DCHECK(_buf_idx + step <= _buffer->size()); + _buf_idx += step; + } + + uint64_t buf_idx() { return _buf_idx; } + uint64_t total_size() { return _total_size; } + uint64_t buffered_size() { return _buffer->size(); } + void set_agg_flag(uint64_t index, bool agg); + + Status has_remaining(); + + Status seek_to_begin(); + + size_t same_source_count(uint16_t source, size_t limit); + +private: + Status _create_buffer_file(); + Status _serialize(); + Status _deserialize(); + void _reset_buffer() { + _buffer->clear(); + _buf_idx = 0; + } + +private: + int64_t _tablet_id; + std::string _tablet_path; + ReaderType _reader_type; + uint64_t _buf_idx = 0; + int _fd = -1; + ColumnUInt16::MutablePtr _buffer; + uint64_t _total_size = 0; +}; + +// --------------- VerticalMergeIteratorContext ------------- // +// takes ownership of rowwise iterator +class VerticalMergeIteratorContext { +public: + VerticalMergeIteratorContext(RowwiseIterator* iter, size_t ori_return_cols, uint32_t order, + uint32_t seq_col_idx) + : _iter(iter), + _ori_return_cols(ori_return_cols), + _order(order), + _seq_col_idx(seq_col_idx), + _num_columns(iter->schema().num_column_ids()), + _num_key_columns(iter->schema().num_key_columns()) {} + + VerticalMergeIteratorContext(const VerticalMergeIteratorContext&) = delete; + VerticalMergeIteratorContext(VerticalMergeIteratorContext&&) = delete; + VerticalMergeIteratorContext& operator=(const VerticalMergeIteratorContext&) = delete; + VerticalMergeIteratorContext& operator=(VerticalMergeIteratorContext&&) = delete; + + ~VerticalMergeIteratorContext() { + delete _iter; + _iter = nullptr; + } + Status block_reset(const std::shared_ptr& block); + Status init(const StorageReadOptions& opts); + bool compare(const VerticalMergeIteratorContext& rhs) const; + void copy_rows(Block* block, bool advanced = true); + void copy_rows(Block* block, size_t count); + + Status advance(); + + // Return if it has remaining data in this context. 
+ // Only when this function return true, current_row() + // will return a valid row + bool valid() const { return _valid; } + + uint32_t order() const { return _order; } + + void set_is_same(bool is_same) const { _is_same = is_same; } + + bool is_same() { return _is_same; } + + void add_cur_batch() { _cur_batch_num++; } + + void reset_cur_batch() { _cur_batch_num = 0; } + + bool is_cur_block_finished() { return _index_in_block == _block->rows() - 1; } + + size_t remain_rows() { return _block->rows() - _index_in_block; } + + bool is_first_row() { return _is_first_row; } + void set_is_first_row(bool is_first_row) { _is_first_row = is_first_row; } + void set_cur_row_ref(vectorized::IteratorRowRef* ref) { + ref->block = _block; + ref->row_pos = _index_in_block; + } + +private: + // Load next block into _block + Status _load_next_block(); + + RowwiseIterator* _iter; + size_t _ori_return_cols = 0; + + // segment order, used to compare key + uint32_t _order = -1; + + uint32_t _seq_col_idx = -1; + + bool _valid = false; + mutable bool _is_same = false; + size_t _index_in_block = -1; + // 4096 minus 16 + 16 bytes padding that in padding pod array + int _block_row_max = 4064; + int _num_columns; + int _num_key_columns; + size_t _cur_batch_num = 0; + + // used to store data load from iterator->next_batch(Block*) + std::shared_ptr _block; + // used to store data still on block view + std::list> _block_list; + // use to identify whether it's first block load from RowwiseIterator + bool _is_first_row = true; +}; + +// --------------- VerticalHeapMergeIterator ------------- // +class VerticalHeapMergeIterator : public RowwiseIterator { +public: + // VerticalMergeIterator takes the ownership of input iterators + VerticalHeapMergeIterator(std::vector iters, KeysType keys_type, + int32_t seq_col_idx, RowSourcesBuffer* row_sources_buf) + : _origin_iters(std::move(iters)), + _keys_type(keys_type), + _seq_col_idx(seq_col_idx), + _row_sources_buf(row_sources_buf) {} + + ~VerticalHeapMergeIterator() override { + while (!_merge_heap.empty()) { + auto ctx = _merge_heap.top(); + _merge_heap.pop(); + delete ctx; + } + } + + Status init(const StorageReadOptions& opts) override; + Status next_batch(Block* block) override; + const Schema& schema() const override { return *_schema; } + uint64_t merged_rows() const override { return _merged_rows; } + +private: + int _get_size(Block* block) { return block->rows(); } + +private: + // It will be released after '_merge_heap' has been built. 
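+ // Ownership of each RowwiseIterator is transferred into a VerticalMergeIteratorContext
+ // in init(); the context deletes the iterator in its own destructor.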
+ std::vector _origin_iters; + + const Schema* _schema = nullptr; + + struct VerticalMergeContextComparator { + bool operator()(const VerticalMergeIteratorContext* lhs, + const VerticalMergeIteratorContext* rhs) const { + return lhs->compare(*rhs); + } + }; + + using VMergeHeap = std::priority_queue, + VerticalMergeContextComparator>; + + VMergeHeap _merge_heap; + int _block_row_max = 0; + KeysType _keys_type; + int32_t _seq_col_idx = -1; + RowSourcesBuffer* _row_sources_buf; + uint32_t _merged_rows = 0; +}; + +// --------------- VerticalMaskMergeIterator ------------- // +class VerticalMaskMergeIterator : public RowwiseIterator { +public: + // VerticalMaskMergeIterator takes the ownership of input iterators + VerticalMaskMergeIterator(std::vector iters, size_t ori_return_cols, + RowSourcesBuffer* row_sources_buf) + : _origin_iters(std::move(iters)), + _ori_return_cols(ori_return_cols), + _row_sources_buf(row_sources_buf) {} + + ~VerticalMaskMergeIterator() override { + for (auto iter : _origin_iter_ctx) { + delete iter; + } + } + + Status init(const StorageReadOptions& opts) override; + + Status next_batch(Block* block) override; + + const Schema& schema() const override { return *_schema; } + + Status next_row(IteratorRowRef* ref) override; + + Status unique_key_next_row(IteratorRowRef* ref) override; + +private: + int _get_size(Block* block) { return block->rows(); } + +private: + // released after build ctx + std::vector _origin_iters; + size_t _ori_return_cols = 0; + + std::vector _origin_iter_ctx; + + const Schema* _schema = nullptr; + + int _block_row_max = 0; + RowSourcesBuffer* _row_sources_buf; +}; + +// segment merge iterator +std::shared_ptr new_vertical_heap_merge_iterator( + const std::vector& inputs, KeysType key_type, uint32_t seq_col_idx, + RowSourcesBuffer* row_sources_buf); + +std::shared_ptr new_vertical_mask_merge_iterator( + const std::vector& inputs, size_t ori_return_cols, + RowSourcesBuffer* row_sources_buf); + +} // namespace vectorized +} // namespace doris \ No newline at end of file diff --git a/be/src/vec/olap/vgeneric_iterators.cpp b/be/src/vec/olap/vgeneric_iterators.cpp index e64668a8388b80..30cd60641d30f0 100644 --- a/be/src/vec/olap/vgeneric_iterators.cpp +++ b/be/src/vec/olap/vgeneric_iterators.cpp @@ -361,7 +361,6 @@ class VMergeIteratorContext { Status VMergeIteratorContext::init(const StorageReadOptions& opts) { _block_row_max = opts.block_row_max; _record_rowids = opts.record_rowids; - RETURN_IF_ERROR(_iter->init(opts)); RETURN_IF_ERROR(_load_next_block()); if (valid()) { RETURN_IF_ERROR(advance()); diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt index a5971dbaf111fd..875eb5d0eb1851 100644 --- a/be/test/CMakeLists.txt +++ b/be/test/CMakeLists.txt @@ -363,6 +363,7 @@ set(VEC_TEST_FILES vec/runtime/vdatetime_value_test.cpp vec/utils/arrow_column_to_doris_column_test.cpp vec/olap/char_type_padding_test.cpp + vec/olap/vertical_compaction_test.cpp ) add_executable(doris_be_test diff --git a/be/test/vec/olap/vertical_compaction_test.cpp b/be/test/vec/olap/vertical_compaction_test.cpp new file mode 100644 index 00000000000000..309ea8b80e323d --- /dev/null +++ b/be/test/vec/olap/vertical_compaction_test.cpp @@ -0,0 +1,862 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "olap/merger.h" +#include "olap/rowset/beta_rowset.h" +#include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_reader.h" +#include "olap/rowset/rowset_reader_context.h" +#include "olap/rowset/rowset_writer.h" +#include "olap/rowset/rowset_writer_context.h" +#include "olap/schema.h" +#include "olap/tablet_schema.h" +#include "olap/tablet_schema_helper.h" +#include "util/file_utils.h" +#include "vec/olap/vertical_block_reader.h" +#include "vec/olap/vertical_merge_iterator.h" + +namespace doris { +namespace vectorized { + +static const uint32_t MAX_PATH_LEN = 1024; +static StorageEngine* k_engine = nullptr; + +class VerticalCompactionTest : public ::testing::Test { +protected: + void SetUp() override { + char buffer[MAX_PATH_LEN]; + EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); + absolute_dir = std::string(buffer) + kTestDir; + + if (FileUtils::check_exist(absolute_dir)) { + EXPECT_TRUE(FileUtils::remove_all(absolute_dir).ok()); + } + EXPECT_TRUE(FileUtils::create_dir(absolute_dir).ok()); + EXPECT_TRUE(FileUtils::create_dir(absolute_dir + "/tablet_path").ok()); + _data_dir = std::make_unique(absolute_dir); + _data_dir->update_capacity(); + doris::EngineOptions options; + k_engine = new StorageEngine(options); + StorageEngine::_s_instance = k_engine; + } + void TearDown() override { + if (FileUtils::check_exist(absolute_dir)) { + EXPECT_TRUE(FileUtils::remove_all(absolute_dir).ok()); + } + if (k_engine != nullptr) { + k_engine->stop(); + delete k_engine; + k_engine = nullptr; + } + } + + TabletSchemaSPtr create_schema(KeysType keys_type = DUP_KEYS) { + TabletSchemaSPtr tablet_schema = std::make_shared(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(keys_type); + tablet_schema_pb.set_num_short_key_columns(1); + tablet_schema_pb.set_num_rows_per_row_block(1024); + tablet_schema_pb.set_compress_kind(COMPRESS_NONE); + tablet_schema_pb.set_next_column_unique_id(4); + + ColumnPB* column_1 = tablet_schema_pb.add_column(); + column_1->set_unique_id(1); + column_1->set_name("c1"); + column_1->set_type("INT"); + column_1->set_is_key(true); + column_1->set_length(4); + column_1->set_index_length(4); + column_1->set_is_nullable(false); + column_1->set_is_bf_column(false); + + ColumnPB* column_2 = tablet_schema_pb.add_column(); + column_2->set_unique_id(2); + column_2->set_name("c2"); + column_2->set_type("INT"); + column_2->set_length(4); + column_2->set_index_length(4); + column_2->set_is_nullable(true); + column_2->set_is_key(false); + column_2->set_is_nullable(false); + column_2->set_is_bf_column(false); + + // unique table must contains the DELETE_SIGN column + if (keys_type == UNIQUE_KEYS) { + ColumnPB* column_3 = tablet_schema_pb.add_column(); + column_3->set_unique_id(3); + column_3->set_name(DELETE_SIGN); + column_3->set_type("TINYINT"); + column_3->set_length(1); + column_3->set_index_length(1); + 
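+ // rows whose DELETE_SIGN value is non-zero are filtered out by
+ // VerticalBlockReader::_unique_key_next_block during compaction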
column_3->set_is_nullable(false); + column_3->set_is_key(false); + column_3->set_is_nullable(false); + column_3->set_is_bf_column(false); + } + + tablet_schema->init_from_pb(tablet_schema_pb); + return tablet_schema; + } + + TabletSchemaSPtr create_agg_schema() { + TabletSchemaSPtr tablet_schema = std::make_shared(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(KeysType::AGG_KEYS); + tablet_schema_pb.set_num_short_key_columns(1); + tablet_schema_pb.set_num_rows_per_row_block(1024); + tablet_schema_pb.set_compress_kind(COMPRESS_NONE); + tablet_schema_pb.set_next_column_unique_id(4); + + ColumnPB* column_1 = tablet_schema_pb.add_column(); + column_1->set_unique_id(1); + column_1->set_name("c1"); + column_1->set_type("INT"); + column_1->set_is_key(true); + column_1->set_length(4); + column_1->set_index_length(4); + column_1->set_is_nullable(false); + column_1->set_is_bf_column(false); + + ColumnPB* column_2 = tablet_schema_pb.add_column(); + column_2->set_unique_id(2); + column_2->set_name("c2"); + column_2->set_type("INT"); + column_2->set_length(4); + column_2->set_index_length(4); + column_2->set_is_nullable(true); + column_2->set_is_key(false); + column_2->set_is_nullable(false); + column_2->set_is_bf_column(false); + column_2->set_aggregation("SUM"); + + tablet_schema->init_from_pb(tablet_schema_pb); + return tablet_schema; + } + + void create_rowset_writer_context(TabletSchemaSPtr tablet_schema, + const SegmentsOverlapPB& overlap, + uint32_t max_rows_per_segment, + RowsetWriterContext* rowset_writer_context) { + static int64_t inc_id = 1000; + RowsetId rowset_id; + rowset_id.init(inc_id); + rowset_writer_context->rowset_id = rowset_id; + rowset_writer_context->rowset_type = VERTICAL_BETA_ROWSET; + rowset_writer_context->data_dir = _data_dir.get(); + rowset_writer_context->rowset_state = VISIBLE; + rowset_writer_context->tablet_schema = tablet_schema; + rowset_writer_context->tablet_path = "tablet_path"; + rowset_writer_context->version = Version(inc_id, inc_id); + rowset_writer_context->segments_overlap = overlap; + rowset_writer_context->max_rows_per_segment = max_rows_per_segment; + inc_id++; + } + + void create_and_init_rowset_reader(Rowset* rowset, RowsetReaderContext& context, + RowsetReaderSharedPtr* result) { + auto s = rowset->create_reader(result); + EXPECT_TRUE(s.ok()); + EXPECT_TRUE(*result != nullptr); + + s = (*result)->init(&context); + EXPECT_TRUE(s.ok()); + } + + RowsetSharedPtr create_rowset( + TabletSchemaSPtr tablet_schema, const SegmentsOverlapPB& overlap, + std::vector>> rowset_data) { + RowsetWriterContext writer_context; + if (overlap == NONOVERLAPPING) { + for (auto i = 1; i < rowset_data.size(); i++) { + auto& last_seg_data = rowset_data[i - 1]; + auto& cur_seg_data = rowset_data[i]; + int64_t last_seg_max = std::get<0>(last_seg_data[last_seg_data.size() - 1]); + int64_t cur_seg_min = std::get<0>(cur_seg_data[0]); + EXPECT_LT(last_seg_max, cur_seg_min); + } + } + create_rowset_writer_context(tablet_schema, overlap, UINT32_MAX, &writer_context); + + std::unique_ptr rowset_writer; + Status s = RowsetFactory::create_rowset_writer(writer_context, &rowset_writer); + EXPECT_TRUE(s.ok()); + + RowCursor input_row; + input_row.init(tablet_schema); + + uint32_t num_rows = 0; + for (int i = 0; i < rowset_data.size(); ++i) { + MemPool mem_pool; + for (int rid = 0; rid < rowset_data[i].size(); ++rid) { + uint32_t c1 = std::get<0>(rowset_data[i][rid]); + uint32_t c2 = std::get<1>(rowset_data[i][rid]); + input_row.set_field_content(0, 
reinterpret_cast(&c1), &mem_pool); + input_row.set_field_content(1, reinterpret_cast(&c2), &mem_pool); + if (tablet_schema->keys_type() == UNIQUE_KEYS) { + uint8_t num = 0; + input_row.set_field_content(2, reinterpret_cast(&num), &mem_pool); + } + s = rowset_writer->add_row(input_row); + EXPECT_TRUE(s.ok()); + num_rows++; + } + s = rowset_writer->flush(); + EXPECT_TRUE(s.ok()); + } + + RowsetSharedPtr rowset; + rowset = rowset_writer->build(); + EXPECT_TRUE(rowset != nullptr); + EXPECT_EQ(rowset_data.size(), rowset->rowset_meta()->num_segments()); + EXPECT_EQ(num_rows, rowset->rowset_meta()->num_rows()); + return rowset; + } + + void init_rs_meta(RowsetMetaSharedPtr& pb1, int64_t start, int64_t end) { + std::string json_rowset_meta = R"({ + "rowset_id": 540085, + "tablet_id": 15674, + "txn_id": 4045, + "tablet_schema_hash": 567997588, + "rowset_type": "BETA_ROWSET", + "rowset_state": "VISIBLE", + "start_version": 2, + "end_version": 2, + "num_rows": 3929, + "total_disk_size": 84699, + "data_disk_size": 84464, + "index_disk_size": 235, + "empty": false, + "load_id": { + "hi": -5350970832824939812, + "lo": -6717994719194512122 + }, + "creation_time": 1553765670, + "alpha_rowset_extra_meta_pb": { + "segment_groups": [ + { + "segment_group_id": 0, + "num_segments": 2, + "index_size": 132, + "data_size": 576, + "num_rows": 5, + "zone_maps": [ + { + "min": "MQ==", + "max": "NQ==", + "null_flag": false + }, + { + "min": "MQ==", + "max": "Mw==", + "null_flag": false + }, + { + "min": "J2J1c2gn", + "max": "J3RvbSc=", + "null_flag": false + } + ], + "empty": false + }] + } + })"; + pb1->init_from_json(json_rowset_meta); + pb1->set_start_version(start); + pb1->set_end_version(end); + pb1->set_creation_time(10000); + } + + void add_delete_predicate(TabletSharedPtr tablet, DeletePredicatePB& del_pred, + int64_t version) { + RowsetMetaSharedPtr rsm(new RowsetMeta()); + init_rs_meta(rsm, version, version); + RowsetId id; + id.init(version * 1000); + rsm->set_rowset_id(id); + rsm->set_delete_predicate(del_pred); + rsm->set_tablet_schema(tablet->tablet_schema()); + RowsetSharedPtr rowset = std::make_shared(tablet->tablet_schema(), "", rsm); + tablet->add_rowset(rowset); + } + + TabletSharedPtr create_tablet(const TabletSchema& tablet_schema, + bool enable_unique_key_merge_on_write, int64_t version, + bool has_delete_handler) { + std::vector cols; + std::unordered_map col_ordinal_to_unique_id; + for (auto i = 0; i < tablet_schema.num_columns(); i++) { + const TabletColumn& column = tablet_schema.column(i); + TColumn col; + col.column_type.type = TPrimitiveType::INT; + col.__set_column_name(column.name()); + col.__set_is_key(column.is_key()); + cols.push_back(col); + col_ordinal_to_unique_id[i] = column.unique_id(); + } + + TTabletSchema t_tablet_schema; + t_tablet_schema.__set_short_key_column_count(tablet_schema.num_short_key_columns()); + t_tablet_schema.__set_schema_hash(3333); + if (tablet_schema.keys_type() == UNIQUE_KEYS) { + t_tablet_schema.__set_keys_type(TKeysType::UNIQUE_KEYS); + } else if (tablet_schema.keys_type() == DUP_KEYS) { + t_tablet_schema.__set_keys_type(TKeysType::DUP_KEYS); + } else if (tablet_schema.keys_type() == AGG_KEYS) { + t_tablet_schema.__set_keys_type(TKeysType::AGG_KEYS); + } + t_tablet_schema.__set_storage_type(TStorageType::COLUMN); + t_tablet_schema.__set_columns(cols); + TabletMetaSharedPtr tablet_meta( + new TabletMeta(2, 2, 2, 2, 2, 2, t_tablet_schema, 2, col_ordinal_to_unique_id, + UniqueId(1, 2), TTabletType::TABLET_TYPE_DISK, + TCompressionType::LZ4F, "", 
enable_unique_key_merge_on_write)); + + TabletSharedPtr tablet(new Tablet(tablet_meta, nullptr)); + tablet->init(); + if (has_delete_handler) { + // delete data with key < 1000 + std::vector conditions; + TCondition condition; + condition.column_name = tablet_schema.column(0).name(); + condition.condition_op = "<"; + condition.condition_values.clear(); + condition.condition_values.push_back("100"); + conditions.push_back(condition); + + DeletePredicatePB del_pred; + Status st = + DeleteHandler::generate_delete_predicate(tablet_schema, conditions, &del_pred); + EXPECT_EQ(Status::OK(), st); + add_delete_predicate(tablet, del_pred, version); + } + return tablet; + } + + // all rowset's data are same + void generate_input_data( + uint32_t num_input_rowset, uint32_t num_segments, uint32_t rows_per_segment, + const SegmentsOverlapPB& overlap, + std::vector>>>& input_data) { + for (auto i = 0; i < num_input_rowset; i++) { + std::vector>> rowset_data; + for (auto j = 0; j < num_segments; j++) { + std::vector> segment_data; + for (auto n = 0; n < rows_per_segment; n++) { + int64_t c1 = j * rows_per_segment + n; + int64_t c2 = c1 + 1; + segment_data.emplace_back(c1, c2); + } + rowset_data.emplace_back(segment_data); + } + input_data.emplace_back(rowset_data); + } + } + + void block_create(TabletSchemaSPtr tablet_schema, vectorized::Block* block) { + block->clear(); + Schema schema(tablet_schema); + const auto& column_ids = schema.column_ids(); + for (size_t i = 0; i < schema.num_column_ids(); ++i) { + auto column_desc = schema.column(column_ids[i]); + auto data_type = Schema::get_data_type_ptr(*column_desc); + EXPECT_TRUE(data_type != nullptr); + auto column = data_type->create_column(); + block->insert(vectorized::ColumnWithTypeAndName(std::move(column), data_type, + column_desc->name())); + } + } + +private: + const std::string kTestDir = "/ut_dir/vertical_compaction_test"; + string absolute_dir; + std::unique_ptr _data_dir; +}; + +TEST_F(VerticalCompactionTest, TestRowSourcesBuffer) { + RowSourcesBuffer buffer(100, absolute_dir, READER_CUMULATIVE_COMPACTION); + RowSource s1(0, 0); + RowSource s2(0, 0); + RowSource s3(1, 1); + RowSource s4(1, 0); + RowSource s5(2, 0); + RowSource s6(2, 0); + std::vector tmp_row_source; + tmp_row_source.emplace_back(s1); + tmp_row_source.emplace_back(s2); + tmp_row_source.emplace_back(s3); + tmp_row_source.emplace_back(s4); + tmp_row_source.emplace_back(s5); + tmp_row_source.emplace_back(s6); + + EXPECT_TRUE(buffer.append(tmp_row_source).ok()); + EXPECT_EQ(buffer.total_size(), 6); + size_t limit = 10; + buffer.flush(); + buffer.seek_to_begin(); + + int idx = -1; + while (buffer.has_remaining().ok()) { + if (++idx == 1) { + EXPECT_TRUE(buffer.current().agg_flag()); + } + auto cur = buffer.current().get_source_num(); + auto same = buffer.same_source_count(cur, limit); + EXPECT_EQ(same, 2); + buffer.advance(same); + } + + RowSourcesBuffer buffer1(101, absolute_dir, READER_CUMULATIVE_COMPACTION); + EXPECT_TRUE(buffer1.append(tmp_row_source).ok()); + EXPECT_TRUE(buffer1.append(tmp_row_source).ok()); + buffer1.set_agg_flag(2, false); + buffer1.set_agg_flag(4, true); + buffer1.flush(); + buffer1.seek_to_begin(); + EXPECT_EQ(buffer1.total_size(), 12); + idx = -1; + while (buffer1.has_remaining().ok()) { + if (++idx == 1) { + EXPECT_FALSE(buffer1.current().agg_flag()); + } + if (++idx == 0) { + EXPECT_TRUE(buffer1.current().agg_flag()); + } + std::cout << buffer1.buf_idx() << std::endl; + auto cur = buffer1.current().get_source_num(); + auto same = 
buffer1.same_source_count(cur, limit); + EXPECT_EQ(same, 2); + buffer1.advance(same); + } +} + +TEST_F(VerticalCompactionTest, TestDupKeyVerticalMerge) { + auto num_input_rowset = 2; + auto num_segments = 2; + auto rows_per_segment = 100; + SegmentsOverlapPB overlap = NONOVERLAPPING; + std::vector>>> input_data; + generate_input_data(num_input_rowset, num_segments, rows_per_segment, overlap, input_data); + for (auto rs_id = 0; rs_id < input_data.size(); rs_id++) { + for (auto s_id = 0; s_id < input_data[rs_id].size(); s_id++) { + for (auto row_id = 0; row_id < input_data[rs_id][s_id].size(); row_id++) { + LOG(INFO) << "input data: " << std::get<0>(input_data[rs_id][s_id][row_id]) << " " + << std::get<1>(input_data[rs_id][s_id][row_id]); + } + } + } + + TabletSchemaSPtr tablet_schema = create_schema(); + // create input rowset + vector input_rowsets; + SegmentsOverlapPB new_overlap = overlap; + for (auto i = 0; i < num_input_rowset; i++) { + if (overlap == OVERLAP_UNKNOWN) { + if (i == 0) { + new_overlap = NONOVERLAPPING; + } else { + new_overlap = OVERLAPPING; + } + } + RowsetSharedPtr rowset = create_rowset(tablet_schema, new_overlap, input_data[i]); + input_rowsets.push_back(rowset); + } + // create input rowset reader + vector input_rs_readers; + for (auto& rowset : input_rowsets) { + RowsetReaderSharedPtr rs_reader; + EXPECT_TRUE(rowset->create_reader(&rs_reader).ok()); + input_rs_readers.push_back(std::move(rs_reader)); + } + + // create output rowset writer + RowsetWriterContext writer_context; + create_rowset_writer_context(tablet_schema, NONOVERLAPPING, 3456, &writer_context); + std::unique_ptr output_rs_writer; + Status s = RowsetFactory::create_rowset_writer(writer_context, &output_rs_writer); + EXPECT_TRUE(s.ok()); + + // merge input rowset + bool has_delete_handler = false; + TabletSharedPtr tablet = create_tablet( + *tablet_schema, false, output_rs_writer->version().first - 1, has_delete_handler); + Merger::Statistics stats; + RowIdConversion rowid_conversion; + stats.rowid_conversion = &rowid_conversion; + s = Merger::vertical_merge_rowsets(tablet, READER_BASE_COMPACTION, tablet_schema, + input_rs_readers, output_rs_writer.get(), 100, &stats); + EXPECT_TRUE(s.ok()); + RowsetSharedPtr out_rowset = output_rs_writer->build(); + + // create output rowset reader + RowsetReaderContext reader_context; + reader_context.tablet_schema = tablet_schema; + reader_context.need_ordered_result = false; + std::vector return_columns = {0, 1}; + reader_context.return_columns = &return_columns; + reader_context.is_vec = true; + RowsetReaderSharedPtr output_rs_reader; + LOG(INFO) << "create rowset reader in test"; + create_and_init_rowset_reader(out_rowset.get(), reader_context, &output_rs_reader); + + // read output rowset data + vectorized::Block output_block; + std::vector> output_data; + do { + block_create(tablet_schema, &output_block); + s = output_rs_reader->next_block(&output_block); + auto columns = output_block.get_columns_with_type_and_name(); + EXPECT_EQ(columns.size(), 2); + for (auto i = 0; i < output_block.rows(); i++) { + output_data.emplace_back(columns[0].column->get_int(i), columns[1].column->get_int(i)); + } + } while (s == Status::OK()); + EXPECT_EQ(Status::OLAPInternalError(OLAP_ERR_DATA_EOF), s); + EXPECT_EQ(out_rowset->rowset_meta()->num_rows(), output_data.size()); + EXPECT_EQ(output_data.size(), num_input_rowset * num_segments * rows_per_segment); + std::vector segment_num_rows; + EXPECT_TRUE(output_rs_reader->get_segment_num_rows(&segment_num_rows).ok()); + // check 
vertical compaction result + for (auto id = 0; id < output_data.size(); id++) { + LOG(INFO) << "output data: " << std::get<0>(output_data[id]) << " " + << std::get<1>(output_data[id]); + } + int dst_id = 0; + for (auto rs_id = 0; rs_id < input_data.size(); rs_id++) { + dst_id = 0; + for (auto s_id = 0; s_id < input_data[rs_id].size(); s_id++) { + for (auto row_id = 0; row_id < input_data[rs_id][s_id].size(); row_id++) { + LOG(INFO) << "input data: " << std::get<0>(input_data[rs_id][s_id][row_id]) << " " + << std::get<1>(input_data[rs_id][s_id][row_id]); + EXPECT_EQ(std::get<0>(input_data[rs_id][s_id][row_id]), + std::get<0>(output_data[dst_id])); + EXPECT_EQ(std::get<1>(input_data[rs_id][s_id][row_id]), + std::get<1>(output_data[dst_id])); + dst_id += 2; + } + } + } +} + +TEST_F(VerticalCompactionTest, TestUniqueKeyVerticalMerge) { + auto num_input_rowset = 2; + auto num_segments = 2; + auto rows_per_segment = 100; + SegmentsOverlapPB overlap = NONOVERLAPPING; + std::vector>>> input_data; + generate_input_data(num_input_rowset, num_segments, rows_per_segment, overlap, input_data); + for (auto rs_id = 0; rs_id < input_data.size(); rs_id++) { + for (auto s_id = 0; s_id < input_data[rs_id].size(); s_id++) { + for (auto row_id = 0; row_id < input_data[rs_id][s_id].size(); row_id++) { + LOG(INFO) << "input data: " << std::get<0>(input_data[rs_id][s_id][row_id]) << " " + << std::get<1>(input_data[rs_id][s_id][row_id]); + } + } + } + + TabletSchemaSPtr tablet_schema = create_schema(UNIQUE_KEYS); + // create input rowset + vector input_rowsets; + SegmentsOverlapPB new_overlap = overlap; + for (auto i = 0; i < num_input_rowset; i++) { + if (overlap == OVERLAP_UNKNOWN) { + if (i == 0) { + new_overlap = NONOVERLAPPING; + } else { + new_overlap = OVERLAPPING; + } + } + RowsetSharedPtr rowset = create_rowset(tablet_schema, new_overlap, input_data[i]); + input_rowsets.push_back(rowset); + } + // create input rowset reader + vector input_rs_readers; + for (auto& rowset : input_rowsets) { + RowsetReaderSharedPtr rs_reader; + EXPECT_TRUE(rowset->create_reader(&rs_reader).ok()); + input_rs_readers.push_back(std::move(rs_reader)); + } + + // create output rowset writer + RowsetWriterContext writer_context; + create_rowset_writer_context(tablet_schema, NONOVERLAPPING, 3456, &writer_context); + std::unique_ptr output_rs_writer; + Status s = RowsetFactory::create_rowset_writer(writer_context, &output_rs_writer); + EXPECT_TRUE(s.ok()); + + // merge input rowset + bool has_delete_handler = false; + TabletSharedPtr tablet = create_tablet( + *tablet_schema, false, output_rs_writer->version().first - 1, has_delete_handler); + Merger::Statistics stats; + RowIdConversion rowid_conversion; + stats.rowid_conversion = &rowid_conversion; + s = Merger::vertical_merge_rowsets(tablet, READER_BASE_COMPACTION, tablet_schema, + input_rs_readers, output_rs_writer.get(), 100, &stats); + EXPECT_TRUE(s.ok()); + RowsetSharedPtr out_rowset = output_rs_writer->build(); + + // create output rowset reader + RowsetReaderContext reader_context; + reader_context.tablet_schema = tablet_schema; + reader_context.need_ordered_result = false; + std::vector return_columns = {0, 1}; + reader_context.return_columns = &return_columns; + reader_context.is_vec = true; + RowsetReaderSharedPtr output_rs_reader; + LOG(INFO) << "create rowset reader in test"; + create_and_init_rowset_reader(out_rowset.get(), reader_context, &output_rs_reader); + + // read output rowset data + vectorized::Block output_block; + std::vector> output_data; + do { + 
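+ // Read the merged rowset block by block until the reader reports EOF, collecting
+ // (c1, c2) pairs; the two input rowsets hold identical data, so with UNIQUE keys
+ // they collapse to num_segments * rows_per_segment rows in the checks below.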
block_create(tablet_schema, &output_block); + s = output_rs_reader->next_block(&output_block); + auto columns = output_block.get_columns_with_type_and_name(); + EXPECT_EQ(columns.size(), 2); + for (auto i = 0; i < output_block.rows(); i++) { + output_data.emplace_back(columns[0].column->get_int(i), columns[1].column->get_int(i)); + } + } while (s == Status::OK()); + EXPECT_EQ(Status::OLAPInternalError(OLAP_ERR_DATA_EOF), s); + EXPECT_EQ(out_rowset->rowset_meta()->num_rows(), output_data.size()); + EXPECT_EQ(output_data.size(), num_segments * rows_per_segment); + std::vector segment_num_rows; + EXPECT_TRUE(output_rs_reader->get_segment_num_rows(&segment_num_rows).ok()); + // check vertical compaction result + for (auto id = 0; id < output_data.size(); id++) { + LOG(INFO) << "output data: " << std::get<0>(output_data[id]) << " " + << std::get<1>(output_data[id]); + } + int dst_id = 0; + for (auto s_id = 0; s_id < input_data[0].size(); s_id++) { + for (auto row_id = 0; row_id < input_data[0][s_id].size(); row_id++) { + EXPECT_EQ(std::get<0>(input_data[0][s_id][row_id]), std::get<0>(output_data[dst_id])); + EXPECT_EQ(std::get<1>(input_data[0][s_id][row_id]), std::get<1>(output_data[dst_id])); + dst_id++; + } + } +} + +TEST_F(VerticalCompactionTest, TestDupKeyVerticalMergeWithDelete) { + auto num_input_rowset = 2; + auto num_segments = 2; + auto rows_per_segment = 100; + SegmentsOverlapPB overlap = NONOVERLAPPING; + std::vector>>> input_data; + generate_input_data(num_input_rowset, num_segments, rows_per_segment, overlap, input_data); + for (auto rs_id = 0; rs_id < input_data.size(); rs_id++) { + for (auto s_id = 0; s_id < input_data[rs_id].size(); s_id++) { + for (auto row_id = 0; row_id < input_data[rs_id][s_id].size(); row_id++) { + LOG(INFO) << "input data: " << std::get<0>(input_data[rs_id][s_id][row_id]) << " " + << std::get<1>(input_data[rs_id][s_id][row_id]); + } + } + } + + TabletSchemaSPtr tablet_schema = create_schema(DUP_KEYS); + // create input rowset + vector input_rowsets; + SegmentsOverlapPB new_overlap = overlap; + for (auto i = 0; i < num_input_rowset; i++) { + if (overlap == OVERLAP_UNKNOWN) { + if (i == 0) { + new_overlap = NONOVERLAPPING; + } else { + new_overlap = OVERLAPPING; + } + } + RowsetSharedPtr rowset = create_rowset(tablet_schema, new_overlap, input_data[i]); + input_rowsets.push_back(rowset); + } + // create input rowset reader + vector input_rs_readers; + for (auto& rowset : input_rowsets) { + RowsetReaderSharedPtr rs_reader; + EXPECT_TRUE(rowset->create_reader(&rs_reader).ok()); + input_rs_readers.push_back(std::move(rs_reader)); + } + + // create output rowset writer + RowsetWriterContext writer_context; + create_rowset_writer_context(tablet_schema, NONOVERLAPPING, 3456, &writer_context); + std::unique_ptr output_rs_writer; + Status s = RowsetFactory::create_rowset_writer(writer_context, &output_rs_writer); + EXPECT_TRUE(s.ok()); + + // merge input rowset + bool has_delete_handler = true; + TabletSharedPtr tablet = create_tablet(*tablet_schema, false, output_rs_writer->version().first, + has_delete_handler); + Merger::Statistics stats; + RowIdConversion rowid_conversion; + stats.rowid_conversion = &rowid_conversion; + s = Merger::vertical_merge_rowsets(tablet, READER_BASE_COMPACTION, tablet_schema, + input_rs_readers, output_rs_writer.get(), 100, &stats); + EXPECT_TRUE(s.ok()); + RowsetSharedPtr out_rowset = output_rs_writer->build(); + + // create output rowset reader + RowsetReaderContext reader_context; + reader_context.tablet_schema = tablet_schema; + 
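+ // Read back the merged rowset with a vectorized reader over both columns. The
+ // tablet created above carries a delete predicate on keys < 100, so the checks
+ // after the read loop expect only rows with key >= 100 to survive compaction.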
reader_context.need_ordered_result = false; + std::vector return_columns = {0, 1}; + reader_context.return_columns = &return_columns; + reader_context.is_vec = true; + RowsetReaderSharedPtr output_rs_reader; + LOG(INFO) << "create rowset reader in test"; + create_and_init_rowset_reader(out_rowset.get(), reader_context, &output_rs_reader); + + // read output rowset data + vectorized::Block output_block; + std::vector> output_data; + do { + block_create(tablet_schema, &output_block); + s = output_rs_reader->next_block(&output_block); + auto columns = output_block.get_columns_with_type_and_name(); + EXPECT_EQ(columns.size(), 2); + for (auto i = 0; i < output_block.rows(); i++) { + output_data.emplace_back(columns[0].column->get_int(i), columns[1].column->get_int(i)); + } + } while (s == Status::OK()); + EXPECT_EQ(Status::OLAPInternalError(OLAP_ERR_DATA_EOF), s); + EXPECT_EQ(out_rowset->rowset_meta()->num_rows(), output_data.size()); + EXPECT_EQ(output_data.size(), + num_input_rowset * num_segments * rows_per_segment - num_input_rowset * 100); + std::vector segment_num_rows; + EXPECT_TRUE(output_rs_reader->get_segment_num_rows(&segment_num_rows).ok()); + if (has_delete_handler) { + // All keys less than 100 are deleted by delete handler + for (auto& item : output_data) { + EXPECT_GE(std::get<0>(item), 100); + } + } +} + +TEST_F(VerticalCompactionTest, TestAggKeyVerticalMerge) { + auto num_input_rowset = 2; + auto num_segments = 2; + auto rows_per_segment = 100; + SegmentsOverlapPB overlap = NONOVERLAPPING; + std::vector>>> input_data; + generate_input_data(num_input_rowset, num_segments, rows_per_segment, overlap, input_data); + for (auto rs_id = 0; rs_id < input_data.size(); rs_id++) { + for (auto s_id = 0; s_id < input_data[rs_id].size(); s_id++) { + for (auto row_id = 0; row_id < input_data[rs_id][s_id].size(); row_id++) { + LOG(INFO) << "input data: " << std::get<0>(input_data[rs_id][s_id][row_id]) << " " + << std::get<1>(input_data[rs_id][s_id][row_id]); + } + } + } + + TabletSchemaSPtr tablet_schema = create_agg_schema(); + // create input rowset + vector input_rowsets; + SegmentsOverlapPB new_overlap = overlap; + for (auto i = 0; i < num_input_rowset; i++) { + if (overlap == OVERLAP_UNKNOWN) { + if (i == 0) { + new_overlap = NONOVERLAPPING; + } else { + new_overlap = OVERLAPPING; + } + } + RowsetSharedPtr rowset = create_rowset(tablet_schema, new_overlap, input_data[i]); + input_rowsets.push_back(rowset); + } + // create input rowset reader + vector input_rs_readers; + for (auto& rowset : input_rowsets) { + RowsetReaderSharedPtr rs_reader; + EXPECT_TRUE(rowset->create_reader(&rs_reader).ok()); + input_rs_readers.push_back(std::move(rs_reader)); + } + + // create output rowset writer + RowsetWriterContext writer_context; + create_rowset_writer_context(tablet_schema, NONOVERLAPPING, 3456, &writer_context); + std::unique_ptr output_rs_writer; + Status s = RowsetFactory::create_rowset_writer(writer_context, &output_rs_writer); + EXPECT_TRUE(s.ok()); + + // merge input rowset + bool has_delete_handler = false; + TabletSharedPtr tablet = create_tablet( + *tablet_schema, false, output_rs_writer->version().first - 1, has_delete_handler); + Merger::Statistics stats; + RowIdConversion rowid_conversion; + stats.rowid_conversion = &rowid_conversion; + s = Merger::vertical_merge_rowsets(tablet, READER_BASE_COMPACTION, tablet_schema, + input_rs_readers, output_rs_writer.get(), 100, &stats); + EXPECT_TRUE(s.ok()); + RowsetSharedPtr out_rowset = output_rs_writer->build(); + + // create output rowset 
reader + RowsetReaderContext reader_context; + reader_context.tablet_schema = tablet_schema; + reader_context.need_ordered_result = false; + std::vector return_columns = {0, 1}; + reader_context.return_columns = &return_columns; + reader_context.is_vec = true; + RowsetReaderSharedPtr output_rs_reader; + LOG(INFO) << "create rowset reader in test"; + create_and_init_rowset_reader(out_rowset.get(), reader_context, &output_rs_reader); + + // read output rowset data + vectorized::Block output_block; + std::vector> output_data; + do { + block_create(tablet_schema, &output_block); + s = output_rs_reader->next_block(&output_block); + auto columns = output_block.get_columns_with_type_and_name(); + EXPECT_EQ(columns.size(), 2); + for (auto i = 0; i < output_block.rows(); i++) { + output_data.emplace_back(columns[0].column->get_int(i), columns[1].column->get_int(i)); + } + } while (s == Status::OK()); + EXPECT_EQ(Status::OLAPInternalError(OLAP_ERR_DATA_EOF), s); + EXPECT_EQ(out_rowset->rowset_meta()->num_rows(), output_data.size()); + EXPECT_EQ(output_data.size(), num_segments * rows_per_segment); + std::vector segment_num_rows; + EXPECT_TRUE(output_rs_reader->get_segment_num_rows(&segment_num_rows).ok()); + // check vertical compaction result + for (auto id = 0; id < output_data.size(); id++) { + LOG(INFO) << "output data: " << std::get<0>(output_data[id]) << " " + << std::get<1>(output_data[id]); + } + int dst_id = 0; + for (auto s_id = 0; s_id < input_data[0].size(); s_id++) { + for (auto row_id = 0; row_id < input_data[0][s_id].size(); row_id++) { + LOG(INFO) << "input data: " << std::get<0>(input_data[0][s_id][row_id]) << " " + << std::get<1>(input_data[0][s_id][row_id]); + EXPECT_EQ(std::get<0>(input_data[0][s_id][row_id]), std::get<0>(output_data[dst_id])); + EXPECT_EQ(std::get<1>(input_data[0][s_id][row_id]) * 2, + std::get<1>(output_data[dst_id])); + dst_id++; + } + } +} + +} // namespace vectorized +} // namespace doris \ No newline at end of file diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index f19228b9351af6..1d58789abfb3fe 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -34,6 +34,7 @@ message ZoneMap { enum RowsetTypePB { ALPHA_ROWSET = 0; // doris original column storage format BETA_ROWSET = 1; // new column storage format + VERTICAL_BETA_ROWSET = 2; // vertical for compaction } enum RowsetStatePB { diff --git a/regression-test/data/compaction/test_vertical_compaction_agg_keys.out b/regression-test/data/compaction/test_vertical_compaction_agg_keys.out new file mode 100644 index 00000000000000..ffcce1d28bcc06 --- /dev/null +++ b/regression-test/data/compaction/test_vertical_compaction_agg_keys.out @@ -0,0 +1,13 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_default -- +1 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-02T00:00 2020-01-02T00:00 2017-10-01T11:11:11.160 2017-10-01T11:11:11.100111 2020-01-02T00:00 2 31 19 \N \N +2 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-03T00:00 2020-01-03T00:00 2017-10-01T11:11:11.140 2017-10-01T11:11:11.120111 2020-01-03T00:00 2 32 20 \N \N +3 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N 2020-01-04T00:00 \N 2017-10-01T11:11:11.150111 2020-01-05T00:00 3 34 20 \N \N +4 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N \N \N \N 2020-01-05T00:00 1 34 20 \N \N + +-- !select_default2 -- +1 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-02T00:00 2020-01-02T00:00 2017-10-01T11:11:11.160 2017-10-01T11:11:11.100111 2020-01-02T00:00 2 31 19 \N \N +2 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-03T00:00 2020-01-03T00:00 2017-10-01T11:11:11.140 2017-10-01T11:11:11.120111 2020-01-03T00:00 2 32 20 \N \N +3 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N 2020-01-04T00:00 \N 2017-10-01T11:11:11.150111 2020-01-05T00:00 3 34 20 \N \N +4 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N \N \N \N 2020-01-05T00:00 1 34 20 \N \N + diff --git a/regression-test/data/compaction/test_vertical_compaction_dup_keys.out b/regression-test/data/compaction/test_vertical_compaction_dup_keys.out new file mode 100644 index 00000000000000..1e15e2d8b0b43b --- /dev/null +++ b/regression-test/data/compaction/test_vertical_compaction_dup_keys.out @@ -0,0 +1,21 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_default -- +1 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-01T00:00 2020-01-01T00:00 2017-10-01T11:11:11.170 2017-10-01T11:11:11.110111 2020-01-01T00:00 1 30 20 +1 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-02T00:00 2020-01-02T00:00 2017-10-01T11:11:11.160 2017-10-01T11:11:11.100111 2020-01-02T00:00 1 31 19 +2 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-02T00:00 2020-01-02T00:00 2017-10-01T11:11:11.150 2017-10-01T11:11:11.130111 2020-01-02T00:00 1 31 21 +2 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-03T00:00 2020-01-03T00:00 2017-10-01T11:11:11.140 2017-10-01T11:11:11.120111 2020-01-03T00:00 1 32 20 +3 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N \N \N \N 2020-01-05T00:00 1 34 20 +3 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-03T00:00 2020-01-03T00:00 2017-10-01T11:11:11.100 2017-10-01T11:11:11.140111 2020-01-03T00:00 1 32 22 +3 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-04T00:00 2020-01-04T00:00 2017-10-01T11:11:11.110 2017-10-01T11:11:11.150111 2020-01-04T00:00 1 33 21 +4 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N \N \N \N 2020-01-05T00:00 1 34 20 + +-- !select_default2 -- +1 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-01T00:00 2020-01-01T00:00 2017-10-01T11:11:11.170 2017-10-01T11:11:11.110111 2020-01-01T00:00 1 30 20 +1 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-02T00:00 2020-01-02T00:00 2017-10-01T11:11:11.160 2017-10-01T11:11:11.100111 2020-01-02T00:00 1 31 19 +2 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-02T00:00 2020-01-02T00:00 2017-10-01T11:11:11.150 2017-10-01T11:11:11.130111 2020-01-02T00:00 1 31 21 +2 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-03T00:00 2020-01-03T00:00 2017-10-01T11:11:11.140 2017-10-01T11:11:11.120111 2020-01-03T00:00 1 32 20 +3 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N \N \N \N 2020-01-05T00:00 1 34 20 +3 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-03T00:00 2020-01-03T00:00 2017-10-01T11:11:11.100 2017-10-01T11:11:11.140111 2020-01-03T00:00 1 32 22 +3 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-04T00:00 2020-01-04T00:00 2017-10-01T11:11:11.110 2017-10-01T11:11:11.150111 2020-01-04T00:00 1 33 21 +4 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N \N \N \N 2020-01-05T00:00 1 34 20 + diff --git a/regression-test/data/compaction/test_vertical_compaction_uniq_keys.out b/regression-test/data/compaction/test_vertical_compaction_uniq_keys.out new file mode 100644 index 00000000000000..0ab6761ce2ff5e --- /dev/null +++ b/regression-test/data/compaction/test_vertical_compaction_uniq_keys.out @@ -0,0 +1,13 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_default -- +1 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-02T00:00 2020-01-02T00:00 2017-10-01T11:11:11.160 2017-10-01T11:11:11.100111 2020-01-02T00:00 1 31 19 +2 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-03T00:00 2020-01-03T00:00 2017-10-01T11:11:11.140 2017-10-01T11:11:11.120111 2020-01-03T00:00 1 32 20 +3 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N \N \N \N 2020-01-05T00:00 1 34 20 +4 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N \N \N \N 2020-01-05T00:00 1 34 20 + +-- !select_default2 -- +1 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-02T00:00 2020-01-02T00:00 2017-10-01T11:11:11.160 2017-10-01T11:11:11.100111 2020-01-02T00:00 1 31 19 +2 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 2020-01-03T00:00 2020-01-03T00:00 2017-10-01T11:11:11.140 2017-10-01T11:11:11.120111 2020-01-03T00:00 1 32 20 +3 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N \N \N \N 2020-01-05T00:00 1 34 20 +4 2017-10-01 2017-10-01 2017-10-01T11:11:11.110 2017-10-01T11:11:11.110111 Beijing 10 1 \N \N \N \N 2020-01-05T00:00 1 34 20 + diff --git a/regression-test/suites/compaction/test_vertical_compaction_agg_keys.groovy b/regression-test/suites/compaction/test_vertical_compaction_agg_keys.groovy new file mode 100644 index 00000000000000..84ce909d2af8ca --- /dev/null +++ b/regression-test/suites/compaction/test_vertical_compaction_agg_keys.groovy @@ -0,0 +1,244 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
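+// Regression coverage for vertical compaction on an AGGREGATE KEY table: the suite
+// turns enable_vertical_compaction on for every BE via /api/update_config, loads
+// eight single-row INSERTs, triggers a cumulative compaction per tablet through
+// /api/compaction/run, waits for run_status to finish, and then checks the rowset
+// row count and re-runs the baseline query to confirm the result is unchanged.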
+ +import org.codehaus.groovy.runtime.IOGroovyMethods + +suite("test_vertical_compaction_agg_keys") { + def tableName = "vertical_compaction_agg_keys_regression_test" + + def set_be_config = { -> + String[][] backends = sql """ show backends; """ + assertTrue(backends.size() > 0) + for (String[] backend in backends) { + StringBuilder setConfigCommand = new StringBuilder(); + setConfigCommand.append("curl -X POST http://") + setConfigCommand.append(backend[2]) + setConfigCommand.append(":") + setConfigCommand.append(backend[5]) + setConfigCommand.append("/api/update_config?") + String command1 = setConfigCommand.toString() + "enable_vertical_compaction=true" + logger.info(command1) + def process1 = command1.execute() + int code = process1.waitFor() + assertEquals(code, 0) + } + } + def reset_be_config = { -> + String[][] backends = sql """ show backends; """ + assertTrue(backends.size() > 0) + for (String[] backend in backends) { + StringBuilder setConfigCommand = new StringBuilder(); + setConfigCommand.append("curl -X POST http://") + setConfigCommand.append(backend[2]) + setConfigCommand.append(":") + setConfigCommand.append(backend[5]) + setConfigCommand.append("/api/update_config?") + String command1 = setConfigCommand.toString() + "enable_vertical_compaction=false" + logger.info(command1) + def process1 = command1.execute() + int code = process1.waitFor() + assertEquals(code, 0) + } + } + + try { + //BackendId,Cluster,IP,HeartbeatPort,BePort,HttpPort,BrpcPort,LastStartTime,LastHeartbeat,Alive,SystemDecommissioned,ClusterDecommissioned,TabletNum,DataUsedCapacity,AvailCapacity,TotalCapacity,UsedPct,MaxDiskUsedPct,Tag,ErrMsg,Version,Status + String[][] backends = sql """ show backends; """ + assertTrue(backends.size() > 0) + String backend_id; + def backendId_to_backendIP = [:] + def backendId_to_backendHttpPort = [:] + for (String[] backend in backends) { + backendId_to_backendIP.put(backend[0], backend[2]) + backendId_to_backendHttpPort.put(backend[0], backend[5]) + } + + backend_id = backendId_to_backendIP.keySet()[0] + StringBuilder showConfigCommand = new StringBuilder(); + showConfigCommand.append("curl -X GET http://") + showConfigCommand.append(backendId_to_backendIP.get(backend_id)) + showConfigCommand.append(":") + showConfigCommand.append(backendId_to_backendHttpPort.get(backend_id)) + showConfigCommand.append("/api/show_config") + logger.info(showConfigCommand.toString()) + def process = showConfigCommand.toString().execute() + int code = process.waitFor() + String err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + String out = process.getText() + logger.info("Show config: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def configList = parseJson(out.trim()) + assert configList instanceof List + + boolean disableAutoCompaction = true + for (Object ele in (List) configList) { + assert ele instanceof List + if (((List) ele)[0] == "disable_auto_compaction") { + disableAutoCompaction = Boolean.parseBoolean(((List) ele)[2]) + } + } + set_be_config.call() + + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ + CREATE TABLE ${tableName} ( + `user_id` LARGEINT NOT NULL COMMENT "用户id", + `date` DATE NOT NULL COMMENT "数据灌入日期时间", + `datev2` DATEV2 NOT NULL COMMENT "数据灌入日期时间", + `datetimev2_1` DATETIMEV2(3) NOT NULL COMMENT "数据灌入日期时间", + `datetimev2_2` DATETIMEV2(6) NOT NULL COMMENT "数据灌入日期时间", + `city` VARCHAR(20) COMMENT "用户所在城市", + `age` SMALLINT COMMENT "用户年龄", + `sex` TINYINT COMMENT "用户性别", + 
`last_visit_date` DATETIME REPLACE DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次访问时间", + `last_update_date` DATETIME REPLACE_IF_NOT_NULL DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次更新时间", + `datetime_val1` DATETIMEV2(3) REPLACE DEFAULT "1970-01-01 00:00:00.111" COMMENT "用户最后一次访问时间", + `datetime_val2` DATETIME(6) REPLACE_IF_NOT_NULL DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次更新时间", + `last_visit_date_not_null` DATETIME REPLACE NOT NULL DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次访问时间", + `cost` BIGINT SUM DEFAULT "0" COMMENT "用户总消费", + `max_dwell_time` INT MAX DEFAULT "0" COMMENT "用户最大停留时间", + `min_dwell_time` INT MIN DEFAULT "99999" COMMENT "用户最小停留时间", + `hll_col` HLL HLL_UNION NOT NULL COMMENT "HLL列", + `bitmap_col` Bitmap BITMAP_UNION NOT NULL COMMENT "bitmap列" ) + AGGREGATE KEY(`user_id`, `date`, `datev2`, `datetimev2_1`, `datetimev2_2`, `city`, `age`, `sex`) DISTRIBUTED BY HASH(`user_id`) BUCKETS 10 + PROPERTIES ( "replication_num" = "1" ); + """ + + sql """ INSERT INTO ${tableName} VALUES + (1, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-01', '2020-01-01', '2017-10-01 11:11:11.170000', '2017-10-01 11:11:11.110111', '2020-01-01', 1, 30, 20, hll_hash(1), to_bitmap(1)) + """ + + sql """ INSERT INTO ${tableName} VALUES + (1, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-02', '2020-01-02', '2017-10-01 11:11:11.160000', '2017-10-01 11:11:11.100111', '2020-01-02', 1, 31, 19, hll_hash(2), to_bitmap(2)) + """ + + sql """ INSERT INTO ${tableName} VALUES + (2, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-02', '2020-01-02', '2017-10-01 11:11:11.150000', '2017-10-01 11:11:11.130111', '2020-01-02', 1, 31, 21, hll_hash(2), to_bitmap(2)) + """ + + sql """ INSERT INTO ${tableName} VALUES + (2, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-03', '2020-01-03', '2017-10-01 11:11:11.140000', '2017-10-01 11:11:11.120111', '2020-01-03', 1, 32, 20, hll_hash(3), to_bitmap(3)) + """ + + sql """ INSERT INTO ${tableName} VALUES + (3, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-03', '2020-01-03', '2017-10-01 11:11:11.100000', '2017-10-01 11:11:11.140111', '2020-01-03', 1, 32, 22, hll_hash(3), to_bitmap(3)) + """ + + sql """ INSERT INTO ${tableName} VALUES + (3, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-04', '2020-01-04', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.150111', '2020-01-04', 1, 33, 21, hll_hash(4), to_bitmap(4)) + """ + + sql """ INSERT INTO ${tableName} VALUES + (3, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, NULL, NULL, NULL, NULL, '2020-01-05', 1, 34, 20, hll_hash(5), to_bitmap(5)) + """ + + sql """ INSERT INTO ${tableName} VALUES + (4, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, NULL, NULL, NULL, NULL, '2020-01-05', 1, 34, 20, hll_hash(5), to_bitmap(5)) + """ + + qt_select_default """ SELECT * FROM ${tableName} t ORDER BY user_id; """ + + 
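+ // The query above records the pre-compaction baseline; everything below triggers
+ // cumulative compaction on each tablet of the table and re-runs the same query
+ // (qt_select_default2) to verify vertical compaction leaves the result unchanged.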
//TabletId,ReplicaId,BackendId,SchemaHash,Version,LstSuccessVersion,LstFailedVersion,LstFailedTime,LocalDataSize,RemoteDataSize,RowCount,State,LstConsistencyCheckTime,CheckVersion,VersionCount,PathHash,MetaUrl,CompactionStatus + String[][] tablets = sql """ show tablets from ${tableName}; """ + + // trigger compactions for all tablets in ${tableName} + for (String[] tablet in tablets) { + String tablet_id = tablet[0] + backend_id = tablet[2] + StringBuilder sb = new StringBuilder(); + sb.append("curl -X POST http://") + sb.append(backendId_to_backendIP.get(backend_id)) + sb.append(":") + sb.append(backendId_to_backendHttpPort.get(backend_id)) + sb.append("/api/compaction/run?tablet_id=") + sb.append(tablet_id) + sb.append("&compact_type=cumulative") + + String command = sb.toString() + process = command.execute() + code = process.waitFor() + err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + out = process.getText() + logger.info("Run compaction: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def compactJson = parseJson(out.trim()) + if (compactJson.status.toLowerCase() == "fail") { + assertEquals(disableAutoCompaction, false) + logger.info("Compaction was done automatically!") + } + if (disableAutoCompaction) { + assertEquals("success", compactJson.status.toLowerCase()) + } + } + + // wait for all compactions done + for (String[] tablet in tablets) { + boolean running = true + do { + Thread.sleep(1000) + String tablet_id = tablet[0] + backend_id = tablet[2] + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://") + sb.append(backendId_to_backendIP.get(backend_id)) + sb.append(":") + sb.append(backendId_to_backendHttpPort.get(backend_id)) + sb.append("/api/compaction/run_status?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + process = command.execute() + code = process.waitFor() + err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + out = process.getText() + logger.info("Get compaction status: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def compactionStatus = parseJson(out.trim()) + assertEquals("success", compactionStatus.status.toLowerCase()) + running = compactionStatus.run_status + } while (running) + } + + int rowCount = 0 + for (String[] tablet in tablets) { + String tablet_id = tablet[0] + StringBuilder sb = new StringBuilder(); + def compactionStatusUrlIndex = 17 + sb.append("curl -X GET ") + sb.append(tablet[compactionStatusUrlIndex]) + String command = sb.toString() + // wait for cleaning stale_rowsets + process = command.execute() + code = process.waitFor() + err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + out = process.getText() + logger.info("Show tablets status: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def tabletJson = parseJson(out.trim()) + assert tabletJson.rowsets instanceof List + for (String rowset in (List) tabletJson.rowsets) { + rowCount += Integer.parseInt(rowset.split(" ")[1]) + } + } + assert (rowCount < 8) + qt_select_default2 """ SELECT * FROM ${tableName} t ORDER BY user_id; """ + } finally { + try_sql("DROP TABLE IF EXISTS ${tableName}") + reset_be_config.call() + } +} diff --git a/regression-test/suites/compaction/test_vertical_compaction_dup_keys.groovy b/regression-test/suites/compaction/test_vertical_compaction_dup_keys.groovy new file mode 100644 
index 00000000000000..ae0f1cdbbcae50 --- /dev/null +++ b/regression-test/suites/compaction/test_vertical_compaction_dup_keys.groovy @@ -0,0 +1,242 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.codehaus.groovy.runtime.IOGroovyMethods + +suite("test_vertical_compaction_dup_keys") { + def tableName = "vertical_compaction_dup_keys_regression_test" + + def set_be_config = { -> + String[][] backends = sql """ show backends; """ + assertTrue(backends.size() > 0) + for (String[] backend in backends) { + StringBuilder setConfigCommand = new StringBuilder(); + setConfigCommand.append("curl -X POST http://") + setConfigCommand.append(backend[2]) + setConfigCommand.append(":") + setConfigCommand.append(backend[5]) + setConfigCommand.append("/api/update_config?") + String command1 = setConfigCommand.toString() + "enable_vertical_compaction=true" + logger.info(command1) + def process1 = command1.execute() + int code = process1.waitFor() + assertEquals(code, 0) + } + } + def reset_be_config = { -> + String[][] backends = sql """ show backends; """ + assertTrue(backends.size() > 0) + for (String[] backend in backends) { + StringBuilder setConfigCommand = new StringBuilder(); + setConfigCommand.append("curl -X POST http://") + setConfigCommand.append(backend[2]) + setConfigCommand.append(":") + setConfigCommand.append(backend[5]) + setConfigCommand.append("/api/update_config?") + String command1 = setConfigCommand.toString() + "enable_vertical_compaction=false" + logger.info(command1) + def process1 = command1.execute() + int code = process1.waitFor() + assertEquals(code, 0) + } + } + + try { + //BackendId,Cluster,IP,HeartbeatPort,BePort,HttpPort,BrpcPort,LastStartTime,LastHeartbeat,Alive,SystemDecommissioned,ClusterDecommissioned,TabletNum,DataUsedCapacity,AvailCapacity,TotalCapacity,UsedPct,MaxDiskUsedPct,Tag,ErrMsg,Version,Status + String[][] backends = sql """ show backends; """ + assertTrue(backends.size() > 0) + String backend_id; + def backendId_to_backendIP = [:] + def backendId_to_backendHttpPort = [:] + for (String[] backend in backends) { + backendId_to_backendIP.put(backend[0], backend[2]) + backendId_to_backendHttpPort.put(backend[0], backend[5]) + } + + backend_id = backendId_to_backendIP.keySet()[0] + StringBuilder showConfigCommand = new StringBuilder(); + showConfigCommand.append("curl -X GET http://") + showConfigCommand.append(backendId_to_backendIP.get(backend_id)) + showConfigCommand.append(":") + showConfigCommand.append(backendId_to_backendHttpPort.get(backend_id)) + showConfigCommand.append("/api/show_config") + logger.info(showConfigCommand.toString()) + def process = showConfigCommand.toString().execute() + int code = process.waitFor() + String err = IOGroovyMethods.getText(new BufferedReader(new 
InputStreamReader(process.getErrorStream()))); + String out = process.getText() + logger.info("Show config: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def configList = parseJson(out.trim()) + assert configList instanceof List + + boolean disableAutoCompaction = true + for (Object ele in (List) configList) { + assert ele instanceof List + if (((List) ele)[0] == "disable_auto_compaction") { + disableAutoCompaction = Boolean.parseBoolean(((List) ele)[2]) + } + } + set_be_config.call() + + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ + CREATE TABLE ${tableName} ( + `user_id` LARGEINT NOT NULL COMMENT "用户id", + `date` DATE NOT NULL COMMENT "数据灌入日期时间", + `datev2` DATEV2 NOT NULL COMMENT "数据灌入日期时间", + `datetimev2_1` DATETIMEV2(3) NOT NULL COMMENT "数据灌入日期时间", + `datetimev2_2` DATETIMEV2(6) NOT NULL COMMENT "数据灌入日期时间", + `city` VARCHAR(20) COMMENT "用户所在城市", + `age` SMALLINT COMMENT "用户年龄", + `sex` TINYINT COMMENT "用户性别", + `last_visit_date` DATETIME DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次访问时间", + `last_update_date` DATETIME DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次更新时间", + `datetime_val1` DATETIMEV2(3) DEFAULT "1970-01-01 00:00:00.111" COMMENT "用户最后一次访问时间", + `datetime_val2` DATETIME(6) DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次更新时间", + `last_visit_date_not_null` DATETIME NOT NULL DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次访问时间", + `cost` BIGINT DEFAULT "0" COMMENT "用户总消费", + `max_dwell_time` INT DEFAULT "0" COMMENT "用户最大停留时间", + `min_dwell_time` INT DEFAULT "99999" COMMENT "用户最小停留时间") + DUPLICATE KEY(`user_id`, `date`, `datev2`, `datetimev2_1`, `datetimev2_2`, `city`, `age`, `sex`) DISTRIBUTED BY HASH(`user_id`) + PROPERTIES ( "replication_num" = "1" ); + """ + + sql """ INSERT INTO ${tableName} VALUES + (1, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-01', '2020-01-01', '2017-10-01 11:11:11.170000', '2017-10-01 11:11:11.110111', '2020-01-01', 1, 30, 20) + """ + + sql """ INSERT INTO ${tableName} VALUES + (1, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-02', '2020-01-02', '2017-10-01 11:11:11.160000', '2017-10-01 11:11:11.100111', '2020-01-02', 1, 31, 19) + """ + + sql """ INSERT INTO ${tableName} VALUES + (2, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-02', '2020-01-02', '2017-10-01 11:11:11.150000', '2017-10-01 11:11:11.130111', '2020-01-02', 1, 31, 21) + """ + + sql """ INSERT INTO ${tableName} VALUES + (2, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-03', '2020-01-03', '2017-10-01 11:11:11.140000', '2017-10-01 11:11:11.120111', '2020-01-03', 1, 32, 20) + """ + + sql """ INSERT INTO ${tableName} VALUES + (3, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-03', '2020-01-03', '2017-10-01 11:11:11.100000', '2017-10-01 11:11:11.140111', '2020-01-03', 1, 32, 22) + """ + + sql """ INSERT INTO ${tableName} VALUES + (3, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-04', '2020-01-04', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.150111', '2020-01-04', 1, 33, 21) + """ + + sql """ INSERT INTO ${tableName} VALUES + (3, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, NULL, 
NULL, NULL, NULL, '2020-01-05', 1, 34, 20) + """ + + sql """ INSERT INTO ${tableName} VALUES + (4, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, NULL, NULL, NULL, NULL, '2020-01-05', 1, 34, 20) + """ + + qt_select_default """ SELECT * FROM ${tableName} t ORDER BY user_id,date,city,age,sex,last_visit_date,last_update_date,last_visit_date_not_null,cost,max_dwell_time,min_dwell_time; """ + + //TabletId,ReplicaId,BackendId,SchemaHash,Version,LstSuccessVersion,LstFailedVersion,LstFailedTime,LocalDataSize,RemoteDataSize,RowCount,State,LstConsistencyCheckTime,CheckVersion,VersionCount,PathHash,MetaUrl,CompactionStatus + String[][] tablets = sql """ show tablets from ${tableName}; """ + + // trigger compactions for all tablets in ${tableName} + for (String[] tablet in tablets) { + String tablet_id = tablet[0] + backend_id = tablet[2] + StringBuilder sb = new StringBuilder(); + sb.append("curl -X POST http://") + sb.append(backendId_to_backendIP.get(backend_id)) + sb.append(":") + sb.append(backendId_to_backendHttpPort.get(backend_id)) + sb.append("/api/compaction/run?tablet_id=") + sb.append(tablet_id) + sb.append("&compact_type=cumulative") + + String command = sb.toString() + process = command.execute() + code = process.waitFor() + err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + out = process.getText() + logger.info("Run compaction: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def compactJson = parseJson(out.trim()) + if (compactJson.status.toLowerCase() == "fail") { + assertEquals(disableAutoCompaction, false) + logger.info("Compaction was done automatically!") + } + if (disableAutoCompaction) { + assertEquals("success", compactJson.status.toLowerCase()) + } + } + + // wait for all compactions done + for (String[] tablet in tablets) { + boolean running = true + do { + Thread.sleep(1000) + String tablet_id = tablet[0] + backend_id = tablet[2] + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://") + sb.append(backendId_to_backendIP.get(backend_id)) + sb.append(":") + sb.append(backendId_to_backendHttpPort.get(backend_id)) + sb.append("/api/compaction/run_status?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + process = command.execute() + code = process.waitFor() + err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + out = process.getText() + logger.info("Get compaction status: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def compactionStatus = parseJson(out.trim()) + assertEquals("success", compactionStatus.status.toLowerCase()) + running = compactionStatus.run_status + } while (running) + } + + int rowCount = 0 + for (String[] tablet in tablets) { + String tablet_id = tablet[0] + StringBuilder sb = new StringBuilder(); + def compactionStatusUrlIndex = 17 + sb.append("curl -X GET ") + sb.append(tablet[compactionStatusUrlIndex]) + String command = sb.toString() + // wait for cleaning stale_rowsets + process = command.execute() + code = process.waitFor() + err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + out = process.getText() + logger.info("Show tablets status: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def tabletJson = parseJson(out.trim()) + assert tabletJson.rowsets instanceof List + for (String rowset in (List) 
tabletJson.rowsets) { + rowCount += Integer.parseInt(rowset.split(" ")[1]) + } + } + assert (rowCount <= 8) + qt_select_default2 """ SELECT * FROM ${tableName} t ORDER BY user_id,date,city,age,sex,last_visit_date,last_update_date,last_visit_date_not_null,cost,max_dwell_time,min_dwell_time; """ + } finally { + try_sql("DROP TABLE IF EXISTS ${tableName}") + reset_be_config.call() + } +} diff --git a/regression-test/suites/compaction/test_vertical_compaction_uniq_keys.groovy b/regression-test/suites/compaction/test_vertical_compaction_uniq_keys.groovy new file mode 100644 index 00000000000000..86b9c56f2d9f5b --- /dev/null +++ b/regression-test/suites/compaction/test_vertical_compaction_uniq_keys.groovy @@ -0,0 +1,242 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.codehaus.groovy.runtime.IOGroovyMethods + +suite("test_vertical_compaction_uniq_keys") { + def tableName = "vertical_compaction_uniq_keys_regression_test" + + def set_be_config = { -> + String[][] backends = sql """ show backends; """ + assertTrue(backends.size() > 0) + for (String[] backend in backends) { + StringBuilder setConfigCommand = new StringBuilder(); + setConfigCommand.append("curl -X POST http://") + setConfigCommand.append(backend[2]) + setConfigCommand.append(":") + setConfigCommand.append(backend[5]) + setConfigCommand.append("/api/update_config?") + String command1 = setConfigCommand.toString() + "enable_vertical_compaction=true" + logger.info(command1) + def process1 = command1.execute() + int code = process1.waitFor() + assertEquals(code, 0) + } + } + def reset_be_config = { -> + String[][] backends = sql """ show backends; """ + assertTrue(backends.size() > 0) + for (String[] backend in backends) { + StringBuilder setConfigCommand = new StringBuilder(); + setConfigCommand.append("curl -X POST http://") + setConfigCommand.append(backend[2]) + setConfigCommand.append(":") + setConfigCommand.append(backend[5]) + setConfigCommand.append("/api/update_config?") + String command1 = setConfigCommand.toString() + "enable_vertical_compaction=false" + logger.info(command1) + def process1 = command1.execute() + int code = process1.waitFor() + assertEquals(code, 0) + } + } + + try { + //BackendId,Cluster,IP,HeartbeatPort,BePort,HttpPort,BrpcPort,LastStartTime,LastHeartbeat,Alive,SystemDecommissioned,ClusterDecommissioned,TabletNum,DataUsedCapacity,AvailCapacity,TotalCapacity,UsedPct,MaxDiskUsedPct,Tag,ErrMsg,Version,Status + String[][] backends = sql """ show backends; """ + assertTrue(backends.size() > 0) + String backend_id; + def backendId_to_backendIP = [:] + def backendId_to_backendHttpPort = [:] + for (String[] backend in backends) { + backendId_to_backendIP.put(backend[0], backend[2]) + backendId_to_backendHttpPort.put(backend[0], backend[5]) + } + + 
backend_id = backendId_to_backendIP.keySet()[0] + StringBuilder showConfigCommand = new StringBuilder(); + showConfigCommand.append("curl -X GET http://") + showConfigCommand.append(backendId_to_backendIP.get(backend_id)) + showConfigCommand.append(":") + showConfigCommand.append(backendId_to_backendHttpPort.get(backend_id)) + showConfigCommand.append("/api/show_config") + logger.info(showConfigCommand.toString()) + def process = showConfigCommand.toString().execute() + int code = process.waitFor() + String err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + String out = process.getText() + logger.info("Show config: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def configList = parseJson(out.trim()) + assert configList instanceof List + + boolean disableAutoCompaction = true + for (Object ele in (List) configList) { + assert ele instanceof List + if (((List) ele)[0] == "disable_auto_compaction") { + disableAutoCompaction = Boolean.parseBoolean(((List) ele)[2]) + } + } + set_be_config.call() + + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ + CREATE TABLE ${tableName} ( + `user_id` LARGEINT NOT NULL COMMENT "用户id", + `date` DATE NOT NULL COMMENT "数据灌入日期时间", + `datev2` DATEV2 NOT NULL COMMENT "数据灌入日期时间", + `datetimev2_1` DATETIMEV2(3) NOT NULL COMMENT "数据灌入日期时间", + `datetimev2_2` DATETIMEV2(6) NOT NULL COMMENT "数据灌入日期时间", + `city` VARCHAR(20) COMMENT "用户所在城市", + `age` SMALLINT COMMENT "用户年龄", + `sex` TINYINT COMMENT "用户性别", + `last_visit_date` DATETIME DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次访问时间", + `last_update_date` DATETIME DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次更新时间", + `datetime_val1` DATETIMEV2(3) DEFAULT "1970-01-01 00:00:00.111" COMMENT "用户最后一次访问时间", + `datetime_val2` DATETIME(6) DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次更新时间", + `last_visit_date_not_null` DATETIME NOT NULL DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次访问时间", + `cost` BIGINT DEFAULT "0" COMMENT "用户总消费", + `max_dwell_time` INT DEFAULT "0" COMMENT "用户最大停留时间", + `min_dwell_time` INT DEFAULT "99999" COMMENT "用户最小停留时间") + UNIQUE KEY(`user_id`, `date`, `datev2`, `datetimev2_1`, `datetimev2_2`, `city`, `age`, `sex`) DISTRIBUTED BY HASH(`user_id`) + PROPERTIES ( "replication_num" = "1" ); + """ + + sql """ INSERT INTO ${tableName} VALUES + (1, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-01', '2020-01-01', '2017-10-01 11:11:11.170000', '2017-10-01 11:11:11.110111', '2020-01-01', 1, 30, 20) + """ + + sql """ INSERT INTO ${tableName} VALUES + (1, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-02', '2020-01-02', '2017-10-01 11:11:11.160000', '2017-10-01 11:11:11.100111', '2020-01-02', 1, 31, 19) + """ + + sql """ INSERT INTO ${tableName} VALUES + (2, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-02', '2020-01-02', '2017-10-01 11:11:11.150000', '2017-10-01 11:11:11.130111', '2020-01-02', 1, 31, 21) + """ + + sql """ INSERT INTO ${tableName} VALUES + (2, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-03', '2020-01-03', '2017-10-01 11:11:11.140000', '2017-10-01 11:11:11.120111', '2020-01-03', 1, 32, 20) + """ + + sql """ INSERT INTO ${tableName} VALUES + (3, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 
1, '2020-01-03', '2020-01-03', '2017-10-01 11:11:11.100000', '2017-10-01 11:11:11.140111', '2020-01-03', 1, 32, 22) + """ + + sql """ INSERT INTO ${tableName} VALUES + (3, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, '2020-01-04', '2020-01-04', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.150111', '2020-01-04', 1, 33, 21) + """ + + sql """ INSERT INTO ${tableName} VALUES + (3, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, NULL, NULL, NULL, NULL, '2020-01-05', 1, 34, 20) + """ + + sql """ INSERT INTO ${tableName} VALUES + (4, '2017-10-01', '2017-10-01', '2017-10-01 11:11:11.110000', '2017-10-01 11:11:11.110111', 'Beijing', 10, 1, NULL, NULL, NULL, NULL, '2020-01-05', 1, 34, 20) + """ + + qt_select_default """ SELECT * FROM ${tableName} t ORDER BY user_id; """ + + //TabletId,ReplicaId,BackendId,SchemaHash,Version,LstSuccessVersion,LstFailedVersion,LstFailedTime,LocalDataSize,RemoteDataSize,RowCount,State,LstConsistencyCheckTime,CheckVersion,VersionCount,PathHash,MetaUrl,CompactionStatus + String[][] tablets = sql """ show tablets from ${tableName}; """ + + // trigger compactions for all tablets in ${tableName} + for (String[] tablet in tablets) { + String tablet_id = tablet[0] + backend_id = tablet[2] + StringBuilder sb = new StringBuilder(); + sb.append("curl -X POST http://") + sb.append(backendId_to_backendIP.get(backend_id)) + sb.append(":") + sb.append(backendId_to_backendHttpPort.get(backend_id)) + sb.append("/api/compaction/run?tablet_id=") + sb.append(tablet_id) + sb.append("&compact_type=cumulative") + + String command = sb.toString() + process = command.execute() + code = process.waitFor() + err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + out = process.getText() + logger.info("Run compaction: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def compactJson = parseJson(out.trim()) + if (compactJson.status.toLowerCase() == "fail") { + assertEquals(disableAutoCompaction, false) + logger.info("Compaction was done automatically!") + } + if (disableAutoCompaction) { + assertEquals("success", compactJson.status.toLowerCase()) + } + } + + // wait for all compactions done + for (String[] tablet in tablets) { + boolean running = true + do { + Thread.sleep(1000) + String tablet_id = tablet[0] + backend_id = tablet[2] + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://") + sb.append(backendId_to_backendIP.get(backend_id)) + sb.append(":") + sb.append(backendId_to_backendHttpPort.get(backend_id)) + sb.append("/api/compaction/run_status?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + process = command.execute() + code = process.waitFor() + err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + out = process.getText() + logger.info("Get compaction status: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def compactionStatus = parseJson(out.trim()) + assertEquals("success", compactionStatus.status.toLowerCase()) + running = compactionStatus.run_status + } while (running) + } + + int rowCount = 0 + for (String[] tablet in tablets) { + String tablet_id = tablet[0] + StringBuilder sb = new StringBuilder(); + def compactionStatusUrlIndex = 17 + sb.append("curl -X GET ") + sb.append(tablet[compactionStatusUrlIndex]) + String command = sb.toString() + // wait 
for cleaning stale_rowsets + process = command.execute() + code = process.waitFor() + err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream()))); + out = process.getText() + logger.info("Show tablets status: code=" + code + ", out=" + out + ", err=" + err) + assertEquals(code, 0) + def tabletJson = parseJson(out.trim()) + assert tabletJson.rowsets instanceof List + for (String rowset in (List) tabletJson.rowsets) { + rowCount += Integer.parseInt(rowset.split(" ")[1]) + } + } + assert (rowCount < 8) + qt_select_default2 """ SELECT * FROM ${tableName} t ORDER BY user_id; """ + } finally { + try_sql("DROP TABLE IF EXISTS ${tableName}") + reset_be_config.call() + } +}
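A side note on the mechanism exercised by TestRowSourcesBuffer in the unit test above: the sketch below is a minimal, self-contained C++ illustration (toy names such as RowSourceLite and a free-standing same_source_count, not Doris's actual RowSourcesBuffer/Merger API) of how the (source, agg_flag) sequence recorded while merging the key columns can later drive the value-column passes, so every column group is written in the same row order.

#include <cstdint>
#include <iostream>
#include <vector>

// Toy stand-in for a row-source entry: which input rowset an output row came
// from, and whether it was folded into the previous row (agg/unique keys).
struct RowSourceLite {
    uint16_t source;
    bool agg_flag;
};

// Length of the run of consecutive entries sharing the same source, starting
// at pos and capped by limit (mirrors the same_source_count/advance pattern).
static size_t same_source_count(const std::vector<RowSourceLite>& buf, size_t pos, size_t limit) {
    size_t n = 0;
    while (pos + n < buf.size() && n < limit && buf[pos + n].source == buf[pos].source) {
        ++n;
    }
    return n;
}

int main() {
    // Pretend the key-column merge of two inputs produced this interleaving.
    std::vector<RowSourceLite> sources = {{0, false}, {0, false}, {1, false},
                                          {1, true},  {0, false}, {1, false}};
    // One value "column" per input rowset; a cursor tracks how far each is consumed.
    std::vector<std::vector<int>> value_cols = {{10, 11, 12}, {20, 21, 22}};
    std::vector<size_t> cursor(value_cols.size(), 0);

    std::vector<int> merged;
    size_t pos = 0;
    while (pos < sources.size()) {
        size_t run = same_source_count(sources, pos, /*limit=*/100);
        uint16_t src = sources[pos].source;
        for (size_t i = 0; i < run; ++i) {
            int v = value_cols[src][cursor[src]++];
            // Simplification of the unique-key case: an agg-flagged row's value is
            // consumed from its input but not emitted.
            if (!sources[pos + i].agg_flag) {
                merged.push_back(v);
            }
        }
        pos += run;
    }

    for (int v : merged) {
        std::cout << v << ' ';
    }
    std::cout << '\n'; // prints: 10 11 20 12 22
}

Replaying whole runs of a single source is what lets the value passes copy ranges from one input at a time instead of re-comparing keys, which is the point of buffering row sources during a vertical merge.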