From 933bb77479489db1cb0286ec2d91b3c9cf11fb13 Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Mon, 7 Mar 2022 10:20:20 +0800 Subject: [PATCH 01/32] Add tablet_writer_add_block into internal_service.proto --- gensrc/proto/internal_service.proto | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/gensrc/proto/internal_service.proto b/gensrc/proto/internal_service.proto index 4d73bcb701bd0d..80c41b22614e25 100644 --- a/gensrc/proto/internal_service.proto +++ b/gensrc/proto/internal_service.proto @@ -104,6 +104,28 @@ message PTabletWriterAddBatchRequest { optional bool is_high_priority = 11 [default = false]; }; +message PTabletWriterAddBlockRequest { + required PUniqueId id = 1; + required int64 index_id = 2; + required int32 sender_id = 3; + + // If this is the last batch from this sender + optional bool eos = 4; + + required int64 packet_seq = 5; + repeated int64 tablet_ids = 6; + // unset if and only if when eos is true + optional PBlock block = 7; + // only valid when eos is true + // valid partition ids that would write in this writer + repeated int64 partition_ids = 8; + // the backend which send this request + optional int64 backend_id = 9 [default = -1]; + // transfer the vectorized::Block to the Controller Attachment + optional bool transfer_by_attachment = 10 [default = false]; + optional bool is_high_priority = 11 [default = false]; +}; + message PTabletError { optional int64 tablet_id = 1; optional string msg = 2; @@ -118,6 +140,15 @@ message PTabletWriterAddBatchResult { repeated PTabletError tablet_errors = 6; }; +message PTabletWriterAddBlockResult { + required PStatus status = 1; + repeated PTabletInfo tablet_vec = 2; + optional int64 execution_time_us = 3; + optional int64 wait_lock_time_us = 4; + optional int64 wait_execution_time_us = 5; + repeated PTabletError tablet_errors = 6; +}; + // tablet writer cancel message PTabletWriterCancelRequest { required PUniqueId id = 1; @@ -428,6 +459,7 @@ service PBackendService { rpc 
fetch_data(PFetchDataRequest) returns (PFetchDataResult); rpc tablet_writer_open(PTabletWriterOpenRequest) returns (PTabletWriterOpenResult); rpc tablet_writer_add_batch(PTabletWriterAddBatchRequest) returns (PTabletWriterAddBatchResult); + rpc tablet_writer_add_block(PTabletWriterAddBlockRequest) returns (PTabletWriterAddBlockResult); rpc tablet_writer_cancel(PTabletWriterCancelRequest) returns (PTabletWriterCancelResult); rpc get_info(PProxyRequest) returns (PProxyResult); rpc update_cache(PUpdateCacheRequest) returns (PCacheResponse); From 0cfb44586f0a8a23f647929146b6f6c57b56b8dd Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Wed, 9 Mar 2022 10:53:33 +0800 Subject: [PATCH 02/32] Add VLoadChannelMgr/VLoadChannel/VTabletsChannel/VDeltaWriter --- be/src/olap/delta_writer.h | 8 +- be/src/olap/memtable.h | 2 + be/src/runtime/load_channel.cpp | 49 ++++---- be/src/runtime/load_channel.h | 34 +++++- be/src/runtime/load_channel_mgr.cpp | 50 ++++---- be/src/runtime/load_channel_mgr.h | 36 +++++- be/src/runtime/tablets_channel.cpp | 30 ++--- be/src/runtime/tablets_channel.h | 34 +++++- be/src/vec/CMakeLists.txt | 6 +- be/src/vec/olap/vdelta_writer.cpp | 78 ++++++++++++ be/src/vec/olap/vdelta_writer.h | 42 +++++++ be/src/vec/runtime/vload_channel.cpp | 88 ++++++++++++++ be/src/vec/runtime/vload_channel.h | 43 +++++++ be/src/vec/runtime/vload_channel_mgr.cpp | 69 +++++++++++ be/src/vec/runtime/vload_channel_mgr.h | 44 +++++++ be/src/vec/runtime/vtablets_channel.cpp | 144 +++++++++++++++++++++++ be/src/vec/runtime/vtablets_channel.h | 40 +++++++ 17 files changed, 717 insertions(+), 80 deletions(-) create mode 100644 be/src/vec/olap/vdelta_writer.cpp create mode 100644 be/src/vec/olap/vdelta_writer.h create mode 100644 be/src/vec/runtime/vload_channel.cpp create mode 100644 be/src/vec/runtime/vload_channel.h create mode 100644 be/src/vec/runtime/vload_channel_mgr.cpp create mode 100644 be/src/vec/runtime/vload_channel_mgr.h create mode 100644 
be/src/vec/runtime/vtablets_channel.cpp create mode 100644 be/src/vec/runtime/vtablets_channel.h diff --git a/be/src/olap/delta_writer.h b/be/src/olap/delta_writer.h index c46dad3f85d9c9..f06157ea63c61e 100644 --- a/be/src/olap/delta_writer.h +++ b/be/src/olap/delta_writer.h @@ -56,12 +56,16 @@ class DeltaWriter { public: static OLAPStatus open(WriteRequest* req, DeltaWriter** writer); - ~DeltaWriter(); + virtual ~DeltaWriter(); OLAPStatus init(); OLAPStatus write(Tuple* tuple); OLAPStatus write(const RowBatch* row_batch, const std::vector& row_idxs); + virtual OLAPStatus write(const vectorized::Block* block, const std::vector& row_idxs) { + return OLAP_ERR_READER_INITIALIZE_ERROR; + } + // flush the last memtable to flush queue, must call it before close_wait() OLAPStatus close(); // wait for all memtables to be flushed. @@ -98,7 +102,7 @@ class DeltaWriter { void _reset_mem_table(); -private: +protected: bool _is_init = false; bool _is_cancelled = false; WriteRequest _req; diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index db0e6a12cf1b64..85b709b0d65c5e 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -25,6 +25,7 @@ #include "olap/skiplist.h" #include "runtime/mem_tracker.h" #include "util/tuple_row_zorder_compare.h" +#include "vec/core/block.h" namespace doris { @@ -47,6 +48,7 @@ class MemTable { size_t memory_usage() const { return _mem_tracker->consumption(); } std::shared_ptr mem_tracker() { return _mem_tracker; } void insert(const Tuple* tuple); + void insert(const vectorized::Block* block, size_t row_pos, size_t& num_rows); /// Flush OLAPStatus flush(); OLAPStatus close(); diff --git a/be/src/runtime/load_channel.cpp b/be/src/runtime/load_channel.cpp index 95a50b5dede627..0d7633de40aa48 100644 --- a/be/src/runtime/load_channel.cpp +++ b/be/src/runtime/load_channel.cpp @@ -19,7 +19,6 @@ #include "olap/lru_cache.h" #include "runtime/mem_tracker.h" -#include "runtime/tablets_channel.h" namespace doris { @@ -64,24 +63,34 @@ 
Status LoadChannel::open(const PTabletWriterOpenRequest& params) { return Status::OK(); } +Status LoadChannel::_get_tablets_channel(std::shared_ptr& channel, bool& is_finished, const int64_t index_id) { + std::lock_guard l(_lock); + auto it = _tablets_channels.find(index_id); + if (it == _tablets_channels.end()) { + if (_finished_channel_ids.find(index_id) != _finished_channel_ids.end()) { + // this channel is already finished, just return OK + is_finished = true; + return Status::OK(); + } + std::stringstream ss; + ss << "load channel " << _load_id << " add batch with unknown index id: " << index_id; + return Status::InternalError(ss.str()); + } + + is_finished = false; + channel = it->second; + return Status::OK(); +} + Status LoadChannel::add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response) { int64_t index_id = request.index_id(); // 1. get tablets channel std::shared_ptr channel; - { - std::lock_guard l(_lock); - auto it = _tablets_channels.find(index_id); - if (it == _tablets_channels.end()) { - if (_finished_channel_ids.find(index_id) != _finished_channel_ids.end()) { - // this channel is already finished, just return OK - return Status::OK(); - } - std::stringstream ss; - ss << "load channel " << _load_id << " add batch with unknown index id: " << index_id; - return Status::InternalError(ss.str()); - } - channel = it->second; + bool is_finished; + Status st = _get_tablets_channel(channel, is_finished, index_id); + if (!st.ok() || is_finished) { + return st; } // 2. check if mem consumption exceed limit @@ -93,16 +102,10 @@ Status LoadChannel::add_batch(const PTabletWriterAddBatchRequest& request, } // 4. 
handle eos - Status st; if (request.has_eos() && request.eos()) { - bool finished = false; - RETURN_IF_ERROR(channel->close(request.sender_id(), request.backend_id(), - &finished, request.partition_ids(), - response->mutable_tablet_vec())); - if (finished) { - std::lock_guard l(_lock); - _tablets_channels.erase(index_id); - _finished_channel_ids.emplace(index_id); + st = _handle_eos(channel, request, response); + if (!st.ok()) { + return st; } } _last_updated_time.store(time(nullptr)); diff --git a/be/src/runtime/load_channel.h b/be/src/runtime/load_channel.h index ba0ad3033498c8..c2ddeaff368869 100644 --- a/be/src/runtime/load_channel.h +++ b/be/src/runtime/load_channel.h @@ -27,12 +27,12 @@ #include "gen_cpp/Types_types.h" #include "gen_cpp/internal_service.pb.h" #include "runtime/mem_tracker.h" +#include "runtime/tablets_channel.h" #include "util/uid_util.h" namespace doris { class Cache; -class TabletsChannel; // A LoadChannel manages tablets channels for all indexes // corresponding to a certain load job @@ -43,12 +43,17 @@ class LoadChannel { ~LoadChannel(); // open a new load channel if not exist - Status open(const PTabletWriterOpenRequest& request); + virtual Status open(const PTabletWriterOpenRequest& request); // this batch must belong to a index in one transaction Status add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response); + virtual Status add_block(const PTabletWriterAddBlockRequest& request, + PTabletWriterAddBlockResult* response) { + return Status::NotSupported("Not Implemented add_block"); + } + // return true if this load channel has been opened and all tablets channels are closed then. 
bool is_finished(); @@ -70,7 +75,30 @@ class LoadChannel { bool is_high_priority() const { return _is_high_priority; } -private: +protected: + Status _get_tablets_channel(std::shared_ptr& channel, + bool& is_finished, + const int64_t index_id); + + template + Status _handle_eos(std::shared_ptr& channel, + const Request& request, + Response* response) { + bool finished = false; + auto index_id = request.index_id(); + RETURN_IF_ERROR(channel->close(request.sender_id(), request.backend_id(), + &finished, request.partition_ids(), + response->mutable_tablet_vec())); + if (finished) { + std::lock_guard l(_lock); + _tablets_channels.erase(index_id); + _finished_channel_ids.emplace(index_id); + } + return Status::OK(); + } + + +protected: // when mem consumption exceeds limit, should call this method to find the channel // that consumes the largest memory(, and then we can reduce its memory usage). bool _find_largest_consumption_channel(std::shared_ptr* channel); diff --git a/be/src/runtime/load_channel_mgr.cpp b/be/src/runtime/load_channel_mgr.cpp index e5e23f511019b8..c2d5d053f3a390 100644 --- a/be/src/runtime/load_channel_mgr.cpp +++ b/be/src/runtime/load_channel_mgr.cpp @@ -18,7 +18,6 @@ #include "runtime/load_channel_mgr.h" #include "gutil/strings/substitute.h" -#include "olap/lru_cache.h" #include "runtime/load_channel.h" #include "runtime/mem_tracker.h" #include "service/backend_options.h" @@ -93,6 +92,13 @@ Status LoadChannelMgr::init(int64_t process_mem_limit) { return Status::OK(); } +LoadChannel* +LoadChannelMgr::_create_load_channel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, + const std::shared_ptr& mem_tracker, bool is_high_priority, + const std::string& sender_ip) { + return new LoadChannel(load_id, mem_limit, timeout_s, mem_tracker, is_high_priority, sender_ip); +} + Status LoadChannelMgr::open(const PTabletWriterOpenRequest& params) { UniqueId load_id(params.id()); std::shared_ptr channel; @@ -124,27 +130,27 @@ Status 
LoadChannelMgr::open(const PTabletWriterOpenRequest& params) { static void dummy_deleter(const CacheKey& key, void* value) {} +void LoadChannelMgr::_finish_load_channel(const UniqueId load_id) { + VLOG_NOTICE << "removing load channel " << load_id << " because it's finished"; + { + std::lock_guard l(_lock); + _load_channels.erase(load_id); + auto handle = + _last_success_channel->insert(load_id.to_string(), nullptr, 1, dummy_deleter); + _last_success_channel->release(handle); + } + VLOG_CRITICAL << "removed load channel " << load_id; +} + Status LoadChannelMgr::add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response) { UniqueId load_id(request.id()); // 1. get load channel std::shared_ptr channel; - { - std::lock_guard l(_lock); - auto it = _load_channels.find(load_id); - if (it == _load_channels.end()) { - auto handle = _last_success_channel->lookup(load_id.to_string()); - // success only when eos be true - if (handle != nullptr) { - _last_success_channel->release(handle); - if (request.has_eos() && request.eos()) { - return Status::OK(); - } - } - return Status::InternalError(strings::Substitute( - "fail to add batch in load channel. unknown load_id=$0", load_id.to_string())); - } - channel = it->second; + bool is_eof; + auto status = _get_load_channel(channel, is_eof, load_id, request); + if (!status.ok() || is_eof) { + return status; } if (!channel->is_high_priority()) { @@ -161,15 +167,7 @@ Status LoadChannelMgr::add_batch(const PTabletWriterAddBatchRequest& request, // 4. 
handle finish if (channel->is_finished()) { - VLOG_NOTICE << "removing load channel " << load_id << " because it's finished"; - { - std::lock_guard l(_lock); - _load_channels.erase(load_id); - auto handle = - _last_success_channel->insert(load_id.to_string(), nullptr, 1, dummy_deleter); - _last_success_channel->release(handle); - } - VLOG_CRITICAL << "removed load channel " << load_id; + _finish_load_channel(load_id); } return Status::OK(); } diff --git a/be/src/runtime/load_channel_mgr.h b/be/src/runtime/load_channel_mgr.h index 1da0ec75a5fc05..018b54e83f4203 100644 --- a/be/src/runtime/load_channel_mgr.h +++ b/be/src/runtime/load_channel_mgr.h @@ -32,6 +32,7 @@ #include "util/countdown_latch.h" #include "util/thread.h" #include "util/uid_util.h" +#include "olap/lru_cache.h" namespace doris { @@ -43,7 +44,7 @@ class LoadChannel; class LoadChannelMgr { public: LoadChannelMgr(); - ~LoadChannelMgr(); + virtual ~LoadChannelMgr(); Status init(int64_t process_mem_limit); @@ -56,14 +57,43 @@ class LoadChannelMgr { // cancel all tablet stream for 'load_id' load Status cancel(const PTabletWriterCancelRequest& request); -private: +protected: + virtual LoadChannel* _create_load_channel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, + const std::shared_ptr& mem_tracker, bool is_high_priority, + const std::string& sender_ip); + + template + Status _get_load_channel(std::shared_ptr& channel, + bool& is_eof, + const UniqueId load_id, + const Request& request) { + is_eof = false; + std::lock_guard l(_lock); + auto it = _load_channels.find(load_id); + if (it == _load_channels.end()) { + auto handle = _last_success_channel->lookup(load_id.to_string()); + // success only when eos be true + if (handle != nullptr) { + _last_success_channel->release(handle); + if (request.has_eos() && request.eos()) { + is_eof = true; + return Status::OK(); + } + } + return Status::InternalError(strings::Substitute( + "fail to add batch in load channel. 
unknown load_id=$0", load_id.to_string())); + } + channel = it->second; + return Status::OK(); + } + void _finish_load_channel(const UniqueId load_id); // check if the total load mem consumption exceeds limit. // If yes, it will pick a load channel to try to reduce memory consumption. void _handle_mem_exceed_limit(); Status _start_bg_worker(); -private: +protected: // lock protect the load channel map std::mutex _lock; // load id -> load channel diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp index e4e0b78a1b12cf..e281892526bdf8 100644 --- a/be/src/runtime/tablets_channel.cpp +++ b/be/src/runtime/tablets_channel.cpp @@ -18,7 +18,6 @@ #include "runtime/tablets_channel.h" #include "exec/tablet_info.h" -#include "gutil/strings/substitute.h" #include "olap/delta_writer.h" #include "olap/memtable.h" #include "runtime/row_batch.h" @@ -78,25 +77,16 @@ Status TabletsChannel::add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response) { DCHECK(request.tablet_ids_size() == request.row_batch().num_rows()); int64_t cur_seq; - { - std::lock_guard l(_lock); - if (_state != kOpened) { - return _state == kFinished - ? 
_close_status - : Status::InternalError(strings::Substitute("TabletsChannel $0 state: $1", - _key.to_string(), _state)); - } - cur_seq = _next_seqs[request.sender_id()]; - // check packet - if (request.packet_seq() < cur_seq) { - LOG(INFO) << "packet has already recept before, expect_seq=" << cur_seq - << ", recept_seq=" << request.packet_seq(); - return Status::OK(); - } else if (request.packet_seq() > cur_seq) { - LOG(WARNING) << "lost data packet, expect_seq=" << cur_seq - << ", recept_seq=" << request.packet_seq(); - return Status::InternalError("lost data packet"); - } + + auto status = _get_current_seq(cur_seq, request); + if (UNLIKELY(!status.ok())) { + return status; + } + + if (request.packet_seq() < cur_seq) { + LOG(INFO) << "packet has already recept before, expect_seq=" << cur_seq + << ", recept_seq=" << request.packet_seq(); + return Status::OK(); } RowBatch row_batch(*_row_desc, request.row_batch()); diff --git a/be/src/runtime/tablets_channel.h b/be/src/runtime/tablets_channel.h index 226b2b76db05f5..8c2c9c1939c235 100644 --- a/be/src/runtime/tablets_channel.h +++ b/be/src/runtime/tablets_channel.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#pragma once + #include #include #include @@ -28,6 +30,7 @@ #include "util/bitmap.h" #include "util/priority_thread_pool.hpp" #include "util/uid_util.h" +#include "gutil/strings/substitute.h" namespace doris { @@ -63,6 +66,10 @@ class TabletsChannel { // no-op when this channel has been closed or cancelled Status add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response); + virtual Status add_block(const PTabletWriterAddBlockRequest& request, PTabletWriterAddBlockResult* response) { + return Status::NotSupported("Not Implemented add_block"); + } + // Mark sender with 'sender_id' as closed. 
// If all senders are closed, close this channel, set '*finished' to true, update 'tablet_vec' // to include all tablets written in this channel. @@ -82,11 +89,31 @@ class TabletsChannel { int64_t mem_consumption() const { return _mem_tracker->consumption(); } +protected: + template + Status _get_current_seq(int64_t& cur_seq, const Request& request) { + std::lock_guard l(_lock); + if (_state != kOpened) { + return _state == kFinished + ? _close_status + : Status::InternalError(strings::Substitute("TabletsChannel $0 state: $1", + _key.to_string(), _state)); + } + cur_seq = _next_seqs[request.sender_id()]; + // check packet + if (request.packet_seq() > cur_seq) { + LOG(WARNING) << "lost data packet, expect_seq=" << cur_seq + << ", recept_seq=" << request.packet_seq(); + return Status::InternalError("lost data packet"); + } + return Status::OK(); + } + private: // open all writer - Status _open_all_writers(const PTabletWriterOpenRequest& request); + virtual Status _open_all_writers(const PTabletWriterOpenRequest& request); -private: +protected: // id of this load channel TabletsChannelKey _key; @@ -104,10 +131,13 @@ class TabletsChannel { int64_t _txn_id = -1; int64_t _index_id = -1; OlapTableSchemaParam* _schema = nullptr; + +private: TupleDescriptor* _tuple_desc = nullptr; // row_desc used to construct RowDescriptor* _row_desc = nullptr; +protected: // next sequence we expect int _num_remaining_senders = 0; std::vector _next_seqs; diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index be9e78828045e8..ccbde602fe883b 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -169,6 +169,7 @@ set(VEC_FILES olap/vgeneric_iterators.cpp olap/vcollect_iterator.cpp olap/block_reader.cpp + olap/vdelta_writer.cpp sink/mysql_result_writer.cpp sink/result_sink.cpp sink/vdata_stream_sender.cpp @@ -179,7 +180,10 @@ set(VEC_FILES runtime/vdata_stream_recvr.cpp runtime/vdata_stream_mgr.cpp runtime/vpartition_info.cpp - 
runtime/vsorted_run_merger.cpp) + runtime/vsorted_run_merger.cpp + runtime/vload_channel.cpp + runtime/vload_channel_mgr.cpp + runtime/vtablets_channel.cpp) add_library(Vec STATIC ${VEC_FILES} diff --git a/be/src/vec/olap/vdelta_writer.cpp b/be/src/vec/olap/vdelta_writer.cpp new file mode 100644 index 00000000000000..519cf489126f7c --- /dev/null +++ b/be/src/vec/olap/vdelta_writer.cpp @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vdelta_writer.h" +#include "olap/storage_engine.h" +#include "olap/memtable.h" + +namespace doris { + +namespace vectorized { + +VDeltaWriter::VDeltaWriter(WriteRequest* req, const std::shared_ptr& parent, + StorageEngine* storage_engine) + : DeltaWriter(req, parent, storage_engine) {} + +VDeltaWriter::~VDeltaWriter() { + +} + +OLAPStatus VDeltaWriter::open(WriteRequest* req, const std::shared_ptr& parent, + VDeltaWriter** writer) { + *writer = new VDeltaWriter(req, parent, StorageEngine::instance()); + return OLAP_SUCCESS; +} + +OLAPStatus VDeltaWriter::write(const vectorized::Block* block, const std::vector& row_idxs) { + if (UNLIKELY(row_idxs.empty())) { + return OLAP_SUCCESS; + } + std::lock_guard l(_lock); + if (!_is_init && !_is_cancelled) { + RETURN_NOT_OK(init()); + } + + if (_is_cancelled) { + return OLAP_ERR_ALREADY_CANCELLED; + } + + int start = 0, end = 0; + + const size_t num_rows = row_idxs.size(); + for (; start < num_rows;) { + auto delta = end + 1 - start; + if (end == num_rows - 1 || (row_idxs[end + 1] - row_idxs[start]) != delta) { + size_t count = delta; + _mem_table->insert(block, start, count); + start += count; + end = start; + } else { + end++; + } + } + + if (_mem_table->memory_usage() >= config::write_buffer_size) { + RETURN_NOT_OK(_flush_memtable_async()); + _reset_mem_table(); + } + + return OLAP_SUCCESS; +} + +} // namespace vectorized + +} // namespace doris \ No newline at end of file diff --git a/be/src/vec/olap/vdelta_writer.h b/be/src/vec/olap/vdelta_writer.h new file mode 100644 index 00000000000000..a3c8eb4e9f7d3e --- /dev/null +++ b/be/src/vec/olap/vdelta_writer.h @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "olap/delta_writer.h" + +namespace doris { + +namespace vectorized { + +class VDeltaWriter : public DeltaWriter { +public: + virtual ~VDeltaWriter() override; + + static OLAPStatus open(WriteRequest* req, const std::shared_ptr& parent, + VDeltaWriter** writer); + + virtual OLAPStatus write(const vectorized::Block* block, const std::vector& row_idxs) override; + +private: + VDeltaWriter(WriteRequest* req, const std::shared_ptr& parent, + StorageEngine* storage_engine); +}; + +} // namespace vectorized + +} // namespace doris \ No newline at end of file diff --git a/be/src/vec/runtime/vload_channel.cpp b/be/src/vec/runtime/vload_channel.cpp new file mode 100644 index 00000000000000..efca66a1d9507b --- /dev/null +++ b/be/src/vec/runtime/vload_channel.cpp @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vload_channel.h" +#include "vtablets_channel.h" + +namespace doris { + +namespace vectorized { + +VLoadChannel::VLoadChannel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, + const std::shared_ptr& mem_tracker, bool is_high_priority, + const std::string& sender_ip) + : LoadChannel(load_id, mem_limit, timeout_s, mem_tracker, is_high_priority, sender_ip) { +} + +Status VLoadChannel::open(const PTabletWriterOpenRequest& params) { + int64_t index_id = params.index_id(); + std::shared_ptr channel; + { + std::lock_guard l(_lock); + auto it = _tablets_channels.find(index_id); + if (it != _tablets_channels.end()) { + channel = it->second; + } else { + // create a new tablets channel + TabletsChannelKey key(params.id(), index_id); + channel.reset(new VTabletsChannel(key, _mem_tracker, _is_high_priority)); + _tablets_channels.insert({index_id, channel}); + } + } + + RETURN_IF_ERROR(channel->open(params)); + + _opened = true; + _last_updated_time.store(time(nullptr)); + return Status::OK(); +} + +Status VLoadChannel::add_block(const PTabletWriterAddBlockRequest& request, + PTabletWriterAddBlockResult* response) { + int64_t index_id = request.index_id(); + // 1. get tablets channel + std::shared_ptr channel; + bool is_finished; + Status st = _get_tablets_channel(channel, is_finished, index_id); + if (!st.ok() || is_finished) { + return st; + } + + // 2. check if mem consumption exceed limit + handle_mem_exceed_limit(false); + + // 3. 
add batch to tablets channel + if (request.has_block()) { + RETURN_IF_ERROR(channel->add_block(request, response)); + } + + // 4. handle eos + if (request.has_eos() && request.eos()) { + if (request.has_eos() && request.eos()) { + st = _handle_eos(channel, request, response); + if (!st.ok()) { + return st; + } + } + } + _last_updated_time.store(time(nullptr)); + return st; +} + +} // namespace vectorized + +} // namespace doris \ No newline at end of file diff --git a/be/src/vec/runtime/vload_channel.h b/be/src/vec/runtime/vload_channel.h new file mode 100644 index 00000000000000..9de359236d29f3 --- /dev/null +++ b/be/src/vec/runtime/vload_channel.h @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "runtime/load_channel.h" + +namespace doris { + +namespace vectorized { + +class VLoadChannel : public LoadChannel { +public: + VLoadChannel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, + const std::shared_ptr& mem_tracker, bool is_high_priority, + const std::string& sender_ip); + + virtual ~VLoadChannel() override = default; + + virtual Status open(const PTabletWriterOpenRequest& request) override; + + virtual Status add_block(const PTabletWriterAddBlockRequest& request, + PTabletWriterAddBlockResult* response) override; + +}; + +} // namespace vectorized + +} // namespace doris \ No newline at end of file diff --git a/be/src/vec/runtime/vload_channel_mgr.cpp b/be/src/vec/runtime/vload_channel_mgr.cpp new file mode 100644 index 00000000000000..daf57c855b83af --- /dev/null +++ b/be/src/vec/runtime/vload_channel_mgr.cpp @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/runtime/vload_channel_mgr.h" +#include "vec/runtime/vload_channel.h" + +namespace doris { + +namespace vectorized { + +VLoadChannelMgr::VLoadChannelMgr() : LoadChannelMgr() {} + +VLoadChannelMgr::~VLoadChannelMgr() {} + +LoadChannel* +VLoadChannelMgr::_create_load_channel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, + const std::shared_ptr& mem_tracker, bool is_high_priority, + const std::string& sender_ip) { + return new VLoadChannel(load_id, mem_limit, timeout_s, mem_tracker, is_high_priority, sender_ip); +} + +Status VLoadChannelMgr::add_block(const PTabletWriterAddBlockRequest& request, + PTabletWriterAddBlockResult* response) { + UniqueId load_id(request.id()); + // 1. get load channel + std::shared_ptr channel; + bool is_eof; + auto status = _get_load_channel(channel, is_eof, load_id, request); + if (!status.ok() || is_eof) { + return status; + } + + if (!channel->is_high_priority()) { + // 2. check if mem consumption exceed limit + // If this is a high priority load task, do not handle this. + // because this may block for a while, which may lead to rpc timeout. + _handle_mem_exceed_limit(); + } + + // 3. add batch to load channel + // batch may not exist in request(eg: eos request without batch), + // this case will be handled in load channel's add batch method. + RETURN_IF_ERROR(channel->add_block(request, response)); + + // 4. handle finish + if (channel->is_finished()) { + _finish_load_channel(load_id); + } + return Status::OK(); + +} + +} // namespace vectorized + +} // namespace doris \ No newline at end of file diff --git a/be/src/vec/runtime/vload_channel_mgr.h b/be/src/vec/runtime/vload_channel_mgr.h new file mode 100644 index 00000000000000..2828d5ad702322 --- /dev/null +++ b/be/src/vec/runtime/vload_channel_mgr.h @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "runtime/load_channel_mgr.h" + +namespace doris { + +class Cache; +class LoadChannel; + +namespace vectorized { + +class VLoadChannelMgr : public LoadChannelMgr { +public: + VLoadChannelMgr(); + virtual ~VLoadChannelMgr() override; + + Status add_block(const PTabletWriterAddBlockRequest& request, + PTabletWriterAddBlockResult* response); +protected: + LoadChannel* _create_load_channel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, + const std::shared_ptr& mem_tracker, bool is_high_priority, + const std::string& sender_ip) override; +}; + +} // namespace vectorized + +} // namespace doris \ No newline at end of file diff --git a/be/src/vec/runtime/vtablets_channel.cpp b/be/src/vec/runtime/vtablets_channel.cpp new file mode 100644 index 00000000000000..275f68a76533ab --- /dev/null +++ b/be/src/vec/runtime/vtablets_channel.cpp @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vtablets_channel.h" +#include "exec/tablet_info.h" +#include "gutil/strings/substitute.h" +#include "vec/olap/vdelta_writer.h" +#include "olap/memtable.h" +#include "runtime/row_batch.h" +#include "runtime/tuple_row.h" +#include "util/doris_metrics.h" + +namespace doris { + +namespace vectorized { + +VTabletsChannel::VTabletsChannel(const TabletsChannelKey& key, + const std::shared_ptr& mem_tracker, + bool is_high_priority) + : TabletsChannel(key, mem_tracker, is_high_priority) {} + +Status VTabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& request) { + std::vector* index_slots = nullptr; + int32_t schema_hash = 0; + for (auto& index : _schema->indexes()) { + if (index->index_id == _index_id) { + index_slots = &index->slots; + schema_hash = index->schema_hash; + break; + } + } + if (index_slots == nullptr) { + std::stringstream ss; + ss << "unknown index id, key=" << _key; + return Status::InternalError(ss.str()); + } + for (auto& tablet : request.tablets()) { + WriteRequest wrequest; + wrequest.tablet_id = tablet.tablet_id(); + wrequest.schema_hash = schema_hash; + wrequest.write_type = WriteType::LOAD; + wrequest.txn_id = _txn_id; + wrequest.partition_id = tablet.partition_id(); + wrequest.load_id = request.id(); + wrequest.need_gen_rollup = request.need_gen_rollup(); + wrequest.slots = index_slots; + wrequest.is_high_priority = _is_high_priority; + + VDeltaWriter* writer = nullptr; + auto st = VDeltaWriter::open(&wrequest, _mem_tracker, &writer); + if (st != OLAP_SUCCESS) { + std::stringstream ss; + ss << "open delta 
writer failed, tablet_id=" << tablet.tablet_id() + << ", txn_id=" << _txn_id << ", partition_id=" << tablet.partition_id() + << ", err=" << st; + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + _tablet_writers.emplace(tablet.tablet_id(), writer); + } + _s_tablet_writer_count += _tablet_writers.size(); + DCHECK_EQ(_tablet_writers.size(), request.tablets_size()); + return Status::OK(); +} + +Status VTabletsChannel::add_block(const PTabletWriterAddBlockRequest& request, + PTabletWriterAddBlockResult* response) { + int64_t cur_seq; + + auto status = _get_current_seq(cur_seq, request); + if (UNLIKELY(!status.ok())) { + return status; + } + + if (request.packet_seq() < cur_seq) { + LOG(INFO) << "packet has already recept before, expect_seq=" << cur_seq + << ", recept_seq=" << request.packet_seq(); + return Status::OK(); + } + + Block block(request.block()); + + std::unordered_map /* row index */> tablet_to_rowidxs; + for (int i = 0; i < request.tablet_ids_size(); ++i) { + int64_t tablet_id = request.tablet_ids(i); + if (_broken_tablets.find(tablet_id) != _broken_tablets.end()) { + // skip broken tablets + continue; + } + auto it = tablet_to_rowidxs.find(tablet_id); + if (it == tablet_to_rowidxs.end()) { + tablet_to_rowidxs.emplace(tablet_id, std::initializer_list{ i }); + } else { + it->second.emplace_back(i); + } + } + + google::protobuf::RepeatedPtrField* tablet_errors = response->mutable_tablet_errors(); + for (const auto& tablet_to_rowidxs_it : tablet_to_rowidxs) { + auto tablet_writer_it = _tablet_writers.find(tablet_to_rowidxs_it.first); + if (tablet_writer_it == _tablet_writers.end()) { + return Status::InternalError( + strings::Substitute("unknown tablet to append data, tablet=$0", tablet_to_rowidxs_it.first)); + } + + OLAPStatus st = tablet_writer_it->second->write(&block, tablet_to_rowidxs_it.second); + if (st != OLAP_SUCCESS) { + auto err_msg = strings::Substitute( + "tablet writer write failed, tablet_id=$0, txn_id=$1, err=$2", + 
tablet_to_rowidxs_it.first, _txn_id, st); + LOG(WARNING) << err_msg; + PTabletError* error = tablet_errors->Add(); + error->set_tablet_id(tablet_to_rowidxs_it.first); + error->set_msg(err_msg); + _broken_tablets.insert(tablet_to_rowidxs_it.first); + // continue write to other tablet. + // the error will return back to sender. + } + } + + { + std::lock_guard l(_lock); + _next_seqs[request.sender_id()] = cur_seq + 1; + } + return Status::OK(); +} + +} // namespace vectorized + +} // namespace doris \ No newline at end of file diff --git a/be/src/vec/runtime/vtablets_channel.h b/be/src/vec/runtime/vtablets_channel.h new file mode 100644 index 00000000000000..1f7d31c1ba0238 --- /dev/null +++ b/be/src/vec/runtime/vtablets_channel.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "runtime/tablets_channel.h" + +namespace doris { + +namespace vectorized { + +class VTabletsChannel : public TabletsChannel { + +public: + VTabletsChannel(const TabletsChannelKey& key, const std::shared_ptr& mem_tracker, bool is_high_priority); + + virtual Status add_block(const PTabletWriterAddBlockRequest& request, + PTabletWriterAddBlockResult* response) override; + +private: + virtual Status _open_all_writers(const PTabletWriterOpenRequest& request) override; +}; + +} // namespace vectorized + +} // namespace doris \ No newline at end of file From a3603a98b697848ddb060515e9175698009a7c4f Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Wed, 9 Mar 2022 11:29:15 +0800 Subject: [PATCH 03/32] Add implement of tablet_writer_add_block --- be/src/runtime/load_channel_mgr.h | 5 +++++ be/src/service/internal_service.cpp | 31 ++++++++++++++++++++++++++ be/src/service/internal_service.h | 5 +++++ be/src/vec/runtime/vload_channel_mgr.h | 4 ++-- 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/be/src/runtime/load_channel_mgr.h b/be/src/runtime/load_channel_mgr.h index 018b54e83f4203..b8cd673246f38d 100644 --- a/be/src/runtime/load_channel_mgr.h +++ b/be/src/runtime/load_channel_mgr.h @@ -53,6 +53,11 @@ class LoadChannelMgr { Status add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response); + + virtual Status add_block(const PTabletWriterAddBlockRequest& request, + PTabletWriterAddBlockResult* response) { + return Status::NotSupported("Not Implemented add_block"); + } // cancel all tablet stream for 'load_id' load Status cancel(const PTabletWriterCancelRequest& request); diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 4cb6b8f7ee84c9..b7917ce97dfa51 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -111,6 +111,37 @@ void PInternalServiceImpl::exec_plan_fragment(google::protobuf::RpcController 
st.to_protobuf(response->mutable_status()); } +template +void PInternalServiceImpl::tablet_writer_add_block(google::protobuf::RpcController* cntl_base, + const PTabletWriterAddBlockRequest* request, + PTabletWriterAddBlockResult* response, + google::protobuf::Closure* done) { + VLOG_RPC << "tablet writer add block, id=" << request->id() + << ", index_id=" << request->index_id() << ", sender_id=" << request->sender_id() + << ", current_queued_size=" << _tablet_worker_pool.get_queue_size(); + int64_t submit_task_time_ns = MonotonicNanos(); + _tablet_worker_pool.offer([cntl_base, request, response, done, submit_task_time_ns, this]() { + int64_t wait_execution_time_ns = MonotonicNanos() - submit_task_time_ns; + brpc::ClosureGuard closure_guard(done); + int64_t execution_time_ns = 0; + { + SCOPED_RAW_TIMER(&execution_time_ns); + brpc::Controller* cntl = static_cast(cntl_base); + attachment_transfer_request_block(request, cntl); + auto st = _exec_env->load_channel_mgr()->add_block(*request, response); + if (!st.ok()) { + LOG(WARNING) << "tablet writer add block failed, message=" << st.get_error_msg() + << ", id=" << request->id() << ", index_id=" << request->index_id() + << ", sender_id=" << request->sender_id() + << ", backend id=" << request->backend_id(); + } + st.to_protobuf(response->mutable_status()); + } + response->set_execution_time_us(execution_time_ns / NANOS_PER_MICRO); + response->set_wait_execution_time_us(wait_execution_time_ns / NANOS_PER_MICRO); + }); +} + template void PInternalServiceImpl::tablet_writer_add_batch(google::protobuf::RpcController* cntl_base, const PTabletWriterAddBatchRequest* request, diff --git a/be/src/service/internal_service.h b/be/src/service/internal_service.h index c4073bf86ed282..ce4913701d0bff 100644 --- a/be/src/service/internal_service.h +++ b/be/src/service/internal_service.h @@ -64,6 +64,11 @@ class PInternalServiceImpl : public T { PTabletWriterAddBatchResult* response, google::protobuf::Closure* done) override; + void 
tablet_writer_add_block(google::protobuf::RpcController* controller, + const PTabletWriterAddBlockRequest* request, + PTabletWriterAddBlockResult* response, + google::protobuf::Closure* done) override; + void tablet_writer_cancel(google::protobuf::RpcController* controller, const PTabletWriterCancelRequest* request, PTabletWriterCancelResult* response, diff --git a/be/src/vec/runtime/vload_channel_mgr.h b/be/src/vec/runtime/vload_channel_mgr.h index 2828d5ad702322..f0ea13a786e5c2 100644 --- a/be/src/vec/runtime/vload_channel_mgr.h +++ b/be/src/vec/runtime/vload_channel_mgr.h @@ -31,8 +31,8 @@ class VLoadChannelMgr : public LoadChannelMgr { VLoadChannelMgr(); virtual ~VLoadChannelMgr() override; - Status add_block(const PTabletWriterAddBlockRequest& request, - PTabletWriterAddBlockResult* response); + virtual Status add_block(const PTabletWriterAddBlockRequest& request, + PTabletWriterAddBlockResult* response) override; protected: LoadChannel* _create_load_channel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, const std::shared_ptr& mem_tracker, bool is_high_priority, From 3b31891130ea043483628555a946ff17e6d4b092 Mon Sep 17 00:00:00 2001 From: jacktengg Date: Mon, 7 Mar 2022 22:35:51 +0800 Subject: [PATCH 04/32] stream load vectorization: vectorized broker scan and tablet sink --- be/src/exec/base_scanner.h | 10 + be/src/exec/broker_scan_node.cpp | 14 +- be/src/exec/broker_scan_node.h | 13 +- be/src/exec/broker_scanner.cpp | 6 - be/src/exec/broker_scanner.h | 16 +- be/src/exec/exec_node.cpp | 8 +- be/src/exec/tablet_sink.cpp | 280 +++++++---------- be/src/exec/tablet_sink.h | 72 +++-- be/src/olap/memtable.cpp | 3 + be/src/runtime/exec_env_init.cpp | 3 +- be/src/vec/CMakeLists.txt | 2 + be/src/vec/exec/vbroker_scan_node.cpp | 238 ++++++++++++++ be/src/vec/exec/vbroker_scan_node.h | 53 ++++ be/src/vec/exec/vbroker_scanner.cpp | 293 ++++++++++++++++++ be/src/vec/exec/vbroker_scanner.h | 39 +++ be/src/vec/sink/vtablet_sink.cpp | 291 ++++++++++++++++- 
be/src/vec/sink/vtablet_sink.h | 54 ++++ .../doris/planner/StreamLoadPlanner.java | 2 + 18 files changed, 1182 insertions(+), 215 deletions(-) create mode 100644 be/src/vec/exec/vbroker_scan_node.cpp create mode 100644 be/src/vec/exec/vbroker_scan_node.h create mode 100644 be/src/vec/exec/vbroker_scanner.cpp create mode 100644 be/src/vec/exec/vbroker_scanner.h diff --git a/be/src/exec/base_scanner.h b/be/src/exec/base_scanner.h index bce0f4b8ca694f..7dc398c40a9aa7 100644 --- a/be/src/exec/base_scanner.h +++ b/be/src/exec/base_scanner.h @@ -33,6 +33,11 @@ class MemTracker; class RuntimeState; class ExprContext; +namespace vectorized { +class IColumn; +using MutableColumnPtr = IColumn::MutablePtr; +} + // The counter will be passed to each scanner. // Note that this struct is not thread safe. // So if we support concurrent scan in the future, we need to modify this struct. @@ -56,6 +61,11 @@ class BaseScanner { // Get next tuple virtual Status get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool *fill_tuple) = 0; + // Get next block + virtual Status get_next(std::vector& columns, bool* eof) { + return Status::NotSupported("Not Implemented get block"); + } + // Close this scanner virtual void close() = 0; Status fill_dest_tuple(Tuple* dest_tuple, MemPool* mem_pool); diff --git a/be/src/exec/broker_scan_node.cpp b/be/src/exec/broker_scan_node.cpp index 6e156f8d836d5e..0d6d777a6b957a 100644 --- a/be/src/exec/broker_scan_node.cpp +++ b/be/src/exec/broker_scan_node.cpp @@ -21,7 +21,7 @@ #include #include "common/object_pool.h" -#include "exec/broker_scanner.h" +#include "vec/exec/vbroker_scanner.h" #include "exec/json_scanner.h" #include "exec/orc_scanner.h" #include "exec/parquet_scanner.h" @@ -236,9 +236,15 @@ std::unique_ptr BrokerScanNode::create_scanner(const TBrokerScanRan _pre_filter_texprs, counter); break; default: - scan = new BrokerScanner(_runtime_state, runtime_profile(), scan_range.params, - scan_range.ranges, scan_range.broker_addresses, - 
_pre_filter_texprs, counter); + if (_vectorized) { + scan = new vectorized::VBrokerScanner(_runtime_state, runtime_profile(), scan_range.params, + scan_range.ranges, scan_range.broker_addresses, + _pre_filter_texprs, counter); + } else { + scan = new BrokerScanner(_runtime_state, runtime_profile(), scan_range.params, + scan_range.ranges, scan_range.broker_addresses, + _pre_filter_texprs, counter); + } } std::unique_ptr scanner(scan); return scanner; diff --git a/be/src/exec/broker_scan_node.h b/be/src/exec/broker_scan_node.h index c4494731d87b18..68adbf007c4433 100644 --- a/be/src/exec/broker_scan_node.h +++ b/be/src/exec/broker_scan_node.h @@ -65,7 +65,6 @@ class BrokerScanNode : public ScanNode { // Write debug string of this into out. virtual void debug_string(int indentation_level, std::stringstream* out) const override; -private: // Update process status to one failed status, // NOTE: Must hold the mutex of this scan node bool update_status(const Status& new_status) { @@ -76,8 +75,12 @@ class BrokerScanNode : public ScanNode { return false; } + std::unique_ptr create_scanner(const TBrokerScanRange& scan_range, + ScannerCounter* counter); + +private: // Create scanners to do scan job - Status start_scanners(); + virtual Status start_scanners(); // One scanner worker, This scanner will handle 'length' ranges start from start_idx void scanner_worker(int start_idx, int length); @@ -86,10 +89,8 @@ class BrokerScanNode : public ScanNode { Status scanner_scan(const TBrokerScanRange& scan_range, const std::vector& conjunct_ctxs, ScannerCounter* counter); - std::unique_ptr create_scanner(const TBrokerScanRange& scan_range, - ScannerCounter* counter); - -private: +protected: + bool _vectorized = false; TupleId _tuple_id; RuntimeState* _runtime_state; TupleDescriptor* _tuple_desc; diff --git a/be/src/exec/broker_scanner.cpp b/be/src/exec/broker_scanner.cpp index e7b85a013fac6a..efd86580e63e23 100644 --- a/be/src/exec/broker_scanner.cpp +++ 
b/be/src/exec/broker_scanner.cpp @@ -30,8 +30,6 @@ #include "exec/plain_binary_line_reader.h" #include "exec/plain_text_line_reader.h" #include "exec/s3_reader.h" -#include "exec/text_converter.h" -#include "exec/text_converter.hpp" #include "exprs/expr.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" @@ -81,10 +79,6 @@ BrokerScanner::~BrokerScanner() { Status BrokerScanner::open() { RETURN_IF_ERROR(BaseScanner::open()); // base default function - _text_converter.reset(new (std::nothrow) TextConverter('\\')); - if (_text_converter == nullptr) { - return Status::InternalError("No memory error."); - } return Status::OK(); } diff --git a/be/src/exec/broker_scanner.h b/be/src/exec/broker_scanner.h index b831539584b75d..c359f6c8a16646 100644 --- a/be/src/exec/broker_scanner.h +++ b/be/src/exec/broker_scanner.h @@ -56,7 +56,7 @@ class BrokerScanner : public BaseScanner { const TBrokerScanRangeParams& params, const std::vector& ranges, const std::vector& broker_addresses, const std::vector& pre_filter_texprs, ScannerCounter* counter); - ~BrokerScanner(); + virtual ~BrokerScanner(); // Open this scanner, will initialize information need to Status open() override; @@ -67,12 +67,16 @@ class BrokerScanner : public BaseScanner { // Close this scanner void close() override; +protected: + // Read next buffer from reader + Status open_next_reader(); + + Status _line_to_src_tuple(const Slice& line); + private: Status open_file_reader(); Status create_decompressor(TFileFormatType::type type); Status open_line_reader(); - // Read next buffer from reader - Status open_next_reader(); // Split one text line to values void split_line(const Slice& line); @@ -88,14 +92,10 @@ class BrokerScanner : public BaseScanner { // output is tuple Status _convert_one_row(const Slice& line, Tuple* tuple, MemPool* tuple_pool); - Status _line_to_src_tuple(const Slice& line); - -private: +protected: const std::vector& _ranges; const std::vector& _broker_addresses; - std::unique_ptr 
_text_converter; - std::string _value_separator; std::string _line_delimiter; TFileFormatType::type _file_format_type; diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp index e3891b6139d887..85c8e5f08ee69c 100644 --- a/be/src/exec/exec_node.cpp +++ b/be/src/exec/exec_node.cpp @@ -80,6 +80,7 @@ #include "vec/exec/vsort_node.h" #include "vec/exec/vtable_function_node.h" #include "vec/exec/vunion_node.h" +#include "vec/exec/vbroker_scan_node.h" #include "vec/exprs/vexpr.h" namespace doris { @@ -390,6 +391,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN case TPlanNodeType::SELECT_NODE: case TPlanNodeType::REPEAT_NODE: case TPlanNodeType::TABLE_FUNCTION_NODE: + case TPlanNodeType::BROKER_SCAN_NODE: break; default: { const auto& i = _TPlanNodeType_VALUES_TO_NAMES.find(tnode.node_type); @@ -553,7 +555,11 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN return Status::OK(); case TPlanNodeType::BROKER_SCAN_NODE: - *node = pool->add(new BrokerScanNode(pool, tnode, descs)); + if (state->enable_vectorized_exec()) { + *node = pool->add(new vectorized::VBrokerScanNode(pool, tnode, descs)); + } else { + *node = pool->add(new BrokerScanNode(pool, tnode, descs)); + } return Status::OK(); case TPlanNodeType::REPEAT_NODE: diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp index 821e3d50eff3c4..56ee842f957058 100644 --- a/be/src/exec/tablet_sink.cpp +++ b/be/src/exec/tablet_sink.cpp @@ -42,6 +42,9 @@ #include "util/time.h" #include "util/uid_util.h" +#include "vec/core/block.h" +#include "vec/sink/vtablet_sink.h" + namespace doris { namespace stream_load { @@ -66,7 +69,9 @@ NodeChannel::~NodeChannel() noexcept { delete _add_batch_closure; _add_batch_closure = nullptr; } - _cur_add_batch_request.release_id(); + if (!_is_vectorized) { + _cur_add_batch_request.release_id(); + } } // if "_cancelled" is set to true, @@ -86,7 +91,6 @@ Status NodeChannel::init(RuntimeState* state) { 
_row_desc.reset(new RowDescriptor(_tuple_desc, false)); _batch_size = state->batch_size(); - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); _stub = state->exec_env()->brpc_internal_client_cache()->get_client(_node_info.host, _node_info.brpc_port); @@ -97,12 +101,18 @@ Status NodeChannel::init(RuntimeState* state) { return Status::InternalError("get rpc stub failed"); } - // Initialize _cur_add_batch_request - _cur_add_batch_request.set_allocated_id(&_parent->_load_id); - _cur_add_batch_request.set_index_id(_index_channel->_index_id); - _cur_add_batch_request.set_sender_id(_parent->_sender_id); - _cur_add_batch_request.set_backend_id(_node_id); - _cur_add_batch_request.set_eos(false); + if (!_is_vectorized) { + _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker.get())); + + // Initialize _cur_add_batch_request + _cur_add_batch_request.set_allocated_id(&_parent->_load_id); + _cur_add_batch_request.set_index_id(_index_channel->_index_id); + _cur_add_batch_request.set_sender_id(_parent->_sender_id); + _cur_add_batch_request.set_backend_id(_node_id); + _cur_add_batch_request.set_eos(false); + + _name = fmt::format("NodeChannel[{}-{}]", _index_channel->_index_id, _node_id); + } _rpc_timeout_ms = state->query_options().query_timeout * 1000; _timeout_watch.start(); @@ -110,7 +120,6 @@ Status NodeChannel::init(RuntimeState* state) { _load_info = "load_id=" + print_id(_parent->_load_id) + ", txn_id=" + std::to_string(_parent->_txn_id); - _name = fmt::format("NodeChannel[{}-{}]", _index_channel->_index_id, _node_id); return Status::OK(); } @@ -184,67 +193,55 @@ Status NodeChannel::open_wait() { return status; } - // add batch closure - _add_batch_closure = ReusableClosure::create(); - _add_batch_closure->addFailedHandler([this](bool is_last_rpc) { - std::lock_guard l(this->_closed_lock); - if (this->_is_closed) { - // if the node channel is closed, no need to call `mark_as_failed`, - // and notice that _index_channel may already be 
destroyed. - return; - } - // If rpc failed, mark all tablets on this node channel as failed - _index_channel->mark_as_failed(this->node_id(), this->host(), - _add_batch_closure->cntl.ErrorText(), -1); - Status st = _index_channel->check_intolerable_failure(); - if (!st.ok()) { - _cancel_with_msg(fmt::format("{}, err: {}", channel_info(), st.get_error_msg())); - } else if (is_last_rpc) { - // if this is last rpc, will must set _add_batches_finished. otherwise, node channel's close_wait - // will be blocked. - _add_batches_finished = true; - } - }); - - _add_batch_closure->addSuccessHandler([this](const PTabletWriterAddBatchResult& result, - bool is_last_rpc) { - std::lock_guard l(this->_closed_lock); - if (this->_is_closed) { - // if the node channel is closed, no need to call the following logic, - // and notice that _index_channel may already be destroyed. - return; - } - Status status(result.status()); - if (status.ok()) { - // if has error tablet, handle them first - for (auto& error : result.tablet_errors()) { - _index_channel->mark_as_failed(this->node_id(), this->host(), error.msg(), - error.tablet_id()); - } - + if (!_is_vectorized) { + // add batch closure + _add_batch_closure = ReusableClosure::create(); + _add_batch_closure->addFailedHandler([this](bool is_last_rpc) { + // If rpc failed, mark all tablets on this node channel as failed + _index_channel->mark_as_failed(this->node_id(), this->host(), _add_batch_closure->cntl.ErrorText(), -1); Status st = _index_channel->check_intolerable_failure(); if (!st.ok()) { - _cancel_with_msg(st.get_error_msg()); + _cancel_with_msg(fmt::format("{}, err: {}", channel_info(), st.get_error_msg())); } else if (is_last_rpc) { - for (auto& tablet : result.tablet_vec()) { - TTabletCommitInfo commit_info; - commit_info.tabletId = tablet.tablet_id(); - commit_info.backendId = _node_id; - _tablet_commit_infos.emplace_back(std::move(commit_info)); - } + // if this is last rpc, will must set _add_batches_finished. 
otherwise, node channel's close_wait + // will be blocked. _add_batches_finished = true; } - } else { - _cancel_with_msg(fmt::format("{}, add batch req success but status isn't ok, err: {}", - channel_info(), status.get_error_msg())); - } + }); - if (result.has_execution_time_us()) { - _add_batch_counter.add_batch_execution_time_us += result.execution_time_us(); - _add_batch_counter.add_batch_wait_execution_time_us += result.wait_execution_time_us(); - _add_batch_counter.add_batch_num++; - } - }); + _add_batch_closure->addSuccessHandler([this](const PTabletWriterAddBatchResult& result, + bool is_last_rpc) { + Status status(result.status()); + if (status.ok()) { + // if has error tablet, handle them first + for (auto& error : result.tablet_errors()) { + _index_channel->mark_as_failed(this->node_id(), this->host(), error.msg(), error.tablet_id()); + } + + Status st = _index_channel->check_intolerable_failure(); + if (!st.ok()) { + _cancel_with_msg(st.get_error_msg()); + } else if (is_last_rpc) { + for (auto& tablet : result.tablet_vec()) { + TTabletCommitInfo commit_info; + commit_info.tabletId = tablet.tablet_id(); + commit_info.backendId = _node_id; + _tablet_commit_infos.emplace_back(std::move(commit_info)); + } + _add_batches_finished = true; + } + } else { + _cancel_with_msg(fmt::format("{}, add batch req success but status isn't ok, err: {}", + channel_info(), status.get_error_msg())); + } + + if (result.has_execution_time_us()) { + _add_batch_counter.add_batch_execution_time_us += result.execution_time_us(); + _add_batch_counter.add_batch_wait_execution_time_us += result.wait_execution_time_us(); + _add_batch_counter.add_batch_num++; + } + }); + } return status; } @@ -297,58 +294,6 @@ Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { return Status::OK(); } -// Used for vectorized engine. 
-// TODO(cmy): deprecated, need refactor -Status NodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { - // If add_row() when _eos_is_produced==true, there must be sth wrong, we can only mark this channel as failed. - auto st = none_of({_cancelled, _eos_is_produced}); - if (!st.ok()) { - if (_cancelled) { - std::lock_guard l(_cancel_msg_lock); - return Status::InternalError("add row failed. " + _cancel_msg); - } else { - return st.clone_and_prepend("already stopped, can't add row. cancelled/eos: "); - } - } - - // We use OlapTableSink mem_tracker which has the same ancestor of _plan node, - // so in the ideal case, mem limit is a matter for _plan node. - // But there is still some unfinished things, we do mem limit here temporarily. - // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. - // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). - while (!_cancelled && _pending_batches_num > 0 && - (_pending_batches_bytes > _max_pending_batches_bytes || - _parent->_mem_tracker->any_limit_exceeded())) { - SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); - SleepFor(MonoDelta::FromMilliseconds(10)); - } - - auto row_no = _cur_batch->add_row(); - if (row_no == RowBatch::INVALID_ROW_INDEX) { - { - SCOPED_ATOMIC_TIMER(&_queue_push_lock_ns); - std::lock_guard l(_pending_batches_lock); - _pending_batches_bytes += _cur_batch->tuple_data_pool()->total_reserved_bytes(); - //To simplify the add_row logic, postpone adding batch into req until the time of sending req - _pending_batches.emplace(std::move(_cur_batch), _cur_add_batch_request); - _pending_batches_num++; - } - - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); - _cur_add_batch_request.clear_tablet_ids(); - - row_no = _cur_batch->add_row(); - } - DCHECK_NE(row_no, RowBatch::INVALID_ROW_INDEX); - - _cur_batch->get_row(row_no)->set_tuple( - 0, 
block_row.first->deep_copy_tuple(*_tuple_desc, _cur_batch->tuple_data_pool(), - block_row.second, 0, true)); - _cur_batch->commit_last_row(); - _cur_add_batch_request.add_tablet_ids(tablet_id); - return Status::OK(); -} - void NodeChannel::mark_close() { auto st = none_of({_cancelled, _eos_is_produced}); if (!st.ok()) { @@ -372,6 +317,11 @@ void NodeChannel::mark_close() { return; } +void NodeChannel::_close_check() { + std::lock_guard lg(_pending_batches_lock); + CHECK(_pending_batches.empty()) << name(); + CHECK(_cur_batch == nullptr) << name(); +} Status NodeChannel::close_wait(RuntimeState* state) { // set _is_closed to true finally Defer set_closed {[&]() { @@ -397,11 +347,7 @@ Status NodeChannel::close_wait(RuntimeState* state) { _close_time_ms = UnixMillis() - _close_time_ms; if (_add_batches_finished) { - { - std::lock_guard lg(_pending_batches_lock); - CHECK(_pending_batches.empty()) << name(); - CHECK(_cur_batch == nullptr) << name(); - } + _close_check(); state->tablet_commit_infos().insert(state->tablet_commit_infos().end(), std::make_move_iterator(_tablet_commit_infos.begin()), std::make_move_iterator(_tablet_commit_infos.end())); @@ -583,7 +529,11 @@ Status IndexChannel::init(RuntimeState* state, const std::vector_pool. // Because the deconstruction of NodeChannel may take a long time to wait rpc finish. // but the ObjectPool will hold a spin lock to delete objects. - channel = std::make_shared(_parent, this, node_id); + if (!_is_vectorized) { + channel = std::make_shared(_parent, this, node_id); + } else { + channel = std::make_shared(_parent, this, node_id); + } _node_channels.emplace(node_id, channel); } else { channel = it->second; @@ -613,20 +563,6 @@ void IndexChannel::add_row(Tuple* tuple, int64_t tablet_id) { } } -// Used for vectorized engine. 
-// TODO(cmy): deprecated, need refactor -void IndexChannel::add_row(BlockRow& block_row, int64_t tablet_id) { - auto it = _channels_by_tablet.find(tablet_id); - DCHECK(it != _channels_by_tablet.end()) << "unknown tablet, tablet_id=" << tablet_id; - for (auto channel : it->second) { - // if this node channel is already failed, this add_row will be skipped - auto st = channel->add_row(block_row, tablet_id); - if (!st.ok()) { - mark_as_failed(channel->node_id(), channel->host(), st.get_error_msg(), tablet_id); - } - } -} - void IndexChannel::mark_as_failed(int64_t node_id, const std::string& host, const std::string& err, int64_t tablet_id) { const auto& it = _tablets_by_channel.find(node_id); @@ -679,10 +615,15 @@ OlapTableSink::OlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc, _input_row_desc(row_desc), _filter_bitmap(1024), _stop_background_threads_latch(1) { - if (!texprs.empty()) { - *status = Expr::create_expr_trees(_pool, texprs, &_output_expr_ctxs); + if (!_is_vectorized) { + if (!texprs.empty()) { + *status = Expr::create_expr_trees(_pool, texprs, &_output_expr_ctxs); + } + _name = "OlapTableSink"; + } + else { + *status = Status::OK(); } - _name = "OlapTableSink"; _transfer_data_by_brpc_attachment = config::transfer_data_by_brpc_attachment; } @@ -747,8 +688,10 @@ Status OlapTableSink::prepare(RuntimeState* state) { state->instance_mem_tracker()); SCOPED_TIMER(_profile->total_time_counter()); - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _input_row_desc, _expr_mem_tracker)); + if (!_is_vectorized) { + // Prepare the exprs to run. 
+ RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _input_row_desc, _expr_mem_tracker)); + } // get table's tuple descriptor _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_desc_id); @@ -756,27 +699,31 @@ Status OlapTableSink::prepare(RuntimeState* state) { LOG(WARNING) << "unknown destination tuple descriptor, id=" << _tuple_desc_id; return Status::InternalError("unknown destination tuple descriptor"); } - if (!_output_expr_ctxs.empty()) { - if (_output_expr_ctxs.size() != _output_tuple_desc->slots().size()) { - LOG(WARNING) << "number of exprs is not same with slots, num_exprs=" - << _output_expr_ctxs.size() - << ", num_slots=" << _output_tuple_desc->slots().size(); - return Status::InternalError("number of exprs is not same with slots"); - } - for (int i = 0; i < _output_expr_ctxs.size(); ++i) { - if (!is_type_compatible(_output_expr_ctxs[i]->root()->type().type, - _output_tuple_desc->slots()[i]->type().type)) { - LOG(WARNING) << "type of exprs is not match slot's, expr_type=" - << _output_expr_ctxs[i]->root()->type().type - << ", slot_type=" << _output_tuple_desc->slots()[i]->type().type - << ", slot_name=" << _output_tuple_desc->slots()[i]->col_name(); - return Status::InternalError("expr's type is not same with slot's"); + + _output_row_desc = _pool->add(new RowDescriptor(_output_tuple_desc, false)); + + if (!_is_vectorized) { + if (!_output_expr_ctxs.empty()) { + if (_output_expr_ctxs.size() != _output_tuple_desc->slots().size()) { + LOG(WARNING) << "number of exprs is not same with slots, num_exprs=" + << _output_expr_ctxs.size() + << ", num_slots=" << _output_tuple_desc->slots().size(); + return Status::InternalError("number of exprs is not same with slots"); + } + for (int i = 0; i < _output_expr_ctxs.size(); ++i) { + if (!is_type_compatible(_output_expr_ctxs[i]->root()->type().type, + _output_tuple_desc->slots()[i]->type().type)) { + LOG(WARNING) << "type of exprs is not match slot's, expr_type=" + << 
_output_expr_ctxs[i]->root()->type().type + << ", slot_type=" << _output_tuple_desc->slots()[i]->type().type + << ", slot_name=" << _output_tuple_desc->slots()[i]->col_name(); + return Status::InternalError("expr's type is not same with slot's"); + } } } - } - _output_row_desc = _pool->add(new RowDescriptor(_output_tuple_desc, false)); - _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size())); + _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size(), _mem_tracker.get())); + } _max_decimalv2_val.resize(_output_tuple_desc->slots().size()); _min_decimalv2_val.resize(_output_tuple_desc->slots().size()); @@ -838,7 +785,13 @@ Status OlapTableSink::prepare(RuntimeState* state) { tablets.emplace_back(std::move(tablet_with_partition)); } } - auto channel = std::make_shared(this, index->index_id); + IndexChannel *index_channel; + if (_is_vectorized) { + index_channel = new VIndexChannel(this, index->index_id); + } else { + index_channel = new IndexChannel(this, index->index_id); + } + auto channel = _pool->add(index_channel); RETURN_IF_ERROR(channel->init(state, tablets)); _channels.emplace_back(channel); } @@ -849,8 +802,11 @@ Status OlapTableSink::prepare(RuntimeState* state) { Status OlapTableSink::open(RuntimeState* state) { SCOPED_TIMER(_profile->total_time_counter()); SCOPED_TIMER(_open_timer); - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); + + if (!_is_vectorized) { + // Prepare the exprs to run. 
+ RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); + } for (auto index_channel : _channels) { index_channel->for_each_node_channel( diff --git a/be/src/exec/tablet_sink.h b/be/src/exec/tablet_sink.h index e0161ba76e9608..b00d896ff1a2e2 100644 --- a/be/src/exec/tablet_sink.h +++ b/be/src/exec/tablet_sink.h @@ -53,6 +53,10 @@ class TupleDescriptor; class ExprContext; class TExpr; +namespace vectorized { +class Block; +class MutableBlock; +} namespace stream_load { class OlapTableSink; @@ -152,25 +156,26 @@ class IndexChannel; class NodeChannel { public: NodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id); - ~NodeChannel() noexcept; + virtual ~NodeChannel() noexcept; // called before open, used to add tablet located in this backend void add_tablet(const TTabletWithPartition& tablet) { _all_tablets.emplace_back(tablet); } - Status init(RuntimeState* state); + virtual Status init(RuntimeState* state); // we use open/open_wait to parallel void open(); - Status open_wait(); + virtual Status open_wait(); Status add_row(Tuple* tuple, int64_t tablet_id); - - Status add_row(BlockRow& block_row, int64_t tablet_id); + virtual Status add_row(BlockRow& block_row, int64_t tablet_id) { + LOG(FATAL) << "add block row to NodeChannel not supported"; + } // two ways to stop channel: // 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response. // 2. just cancel() - void mark_close(); + virtual void mark_close(); Status close_wait(RuntimeState* state); void cancel(const std::string& cancel_msg); @@ -180,7 +185,7 @@ class NodeChannel { // 1: running, haven't reach eos. // only allow 1 rpc in flight // plz make sure, this func should be called after open_wait(). 
- int try_send_and_fetch_status(RuntimeState* state, + virtual int try_send_and_fetch_status(RuntimeState* state, std::unique_ptr& thread_pool_token); void try_send_batch(RuntimeState* state); @@ -209,15 +214,21 @@ class NodeChannel { void clear_all_batches(); + virtual void clear_all_blocks() { + LOG(FATAL) << "NodeChannel::clear_all_blocks not supported"; + } + std::string channel_info() const { return fmt::format("{}, {}, node={}:{}", _name, _load_info, _node_info.host, _node_info.brpc_port); } -private: +protected: void _cancel_with_msg(const std::string& msg); + virtual void _close_check(); -private: +protected: + bool _is_vectorized = false; OlapTableSink* _parent = nullptr; IndexChannel* _index_channel = nullptr; int64_t _node_id = -1; @@ -243,28 +254,23 @@ class NodeChannel { std::atomic _send_finished {false}; // add batches finished means the last rpc has be response, used to check whether this channel can be closed - std::atomic _add_batches_finished {false}; + std::atomic _add_batches_finished {false}; // reuse for vectorized - std::atomic _last_patch_processed_finished {true}; + std::atomic _last_patch_processed_finished {true}; // reuse for vectorized bool _eos_is_produced {false}; // only for restricting producer behaviors std::unique_ptr _row_desc; int _batch_size = 0; - std::unique_ptr _cur_batch; - PTabletWriterAddBatchRequest _cur_add_batch_request; - std::mutex _pending_batches_lock; - using AddBatchReq = std::pair, PTabletWriterAddBatchRequest>; - std::queue _pending_batches; - std::atomic _pending_batches_num {0}; // limit _pending_batches size std::atomic _pending_batches_bytes {0}; size_t _max_pending_batches_bytes {10 * 1024 * 1024}; + std::mutex _pending_batches_lock; // reuse for vectorized + std::atomic _pending_batches_num {0}; // reuse for vectorized std::shared_ptr _stub = nullptr; RefCountClosure* _open_closure = nullptr; - ReusableClosure* _add_batch_closure = nullptr; std::vector _all_tablets; std::vector _tablet_commit_infos; @@ 
-275,6 +281,7 @@ class NodeChannel { std::atomic _queue_push_lock_ns {0}; std::atomic _actual_consume_ns {0}; +private: // buffer for saving serialized row batch data. // In the non-attachment approach, we need to use two PRowBatch structures alternately // so that when one PRowBatch is sent, the other PRowBatch can be used for the serialization of the next RowBatch. @@ -297,6 +304,11 @@ class NodeChannel { // The IndexChannel is definitely accessible until the NodeChannel is closed. std::mutex _closed_lock; bool _is_closed = false; + std::unique_ptr _cur_batch; + PTabletWriterAddBatchRequest _cur_add_batch_request; + using AddBatchReq = std::pair, PTabletWriterAddBatchRequest>; + std::queue _pending_batches; + ReusableClosure* _add_batch_closure = nullptr; }; class IndexChannel { @@ -304,13 +316,15 @@ class IndexChannel { IndexChannel(OlapTableSink* parent, int64_t index_id) : _parent(parent), _index_id(index_id) { _index_channel_tracker = MemTracker::create_tracker(-1, "IndexChannel"); } - ~IndexChannel(); + virtual ~IndexChannel(); Status init(RuntimeState* state, const std::vector& tablets); void add_row(Tuple* tuple, int64_t tablet_id); - void add_row(BlockRow& block_row, int64_t tablet_id); + virtual void add_row(BlockRow& block_row, int64_t tablet_id) { + LOG(FATAL) << "add block row to IndexChannel not supported"; + } void for_each_node_channel(const std::function&)>& func) { for (auto& it : _node_channels) { @@ -326,9 +340,11 @@ class IndexChannel { size_t num_node_channels() const { return _node_channels.size(); } -private: +protected: friend class NodeChannel; + friend class VNodeChannel; + bool _is_vectorized = false; OlapTableSink* _parent; int64_t _index_id; @@ -401,6 +417,11 @@ class OlapTableSink : public DataSink { friend class NodeChannel; friend class IndexChannel; + friend class VNodeChannel; + friend class VIndexChannel; + + bool _is_vectorized = false; + std::shared_ptr _mem_tracker; ObjectPool* _pool; @@ -415,8 +436,6 @@ class OlapTableSink : 
public DataSink { // this is tuple descriptor of destination OLAP table TupleDescriptor* _output_tuple_desc = nullptr; RowDescriptor* _output_row_desc = nullptr; - std::vector _output_expr_ctxs; - std::unique_ptr _output_batch; bool _need_validate_data = false; @@ -429,7 +448,6 @@ class OlapTableSink : public DataSink { // TODO(zc): think about cache this data std::shared_ptr _schema; - OlapTablePartitionParam* _partition = nullptr; OlapTableLocationParam* _location = nullptr; DorisNodesInfo* _nodes_info = nullptr; @@ -455,7 +473,6 @@ class OlapTableSink : public DataSink { int64_t _convert_batch_ns = 0; int64_t _validate_data_ns = 0; int64_t _send_data_ns = 0; - int64_t _serialize_batch_ns = 0; int64_t _number_input_rows = 0; int64_t _number_output_rows = 0; int64_t _number_filtered_rows = 0; @@ -502,6 +519,11 @@ class OlapTableSink : public DataSink { FIND_TABLET_EVERY_ROW, FIND_TABLET_EVERY_BATCH, FIND_TABLET_EVERY_SINK }; FindTabletMode findTabletMode = FindTabletMode::FIND_TABLET_EVERY_ROW; + +private: + OlapTablePartitionParam* _partition = nullptr; + std::vector _output_expr_ctxs; + std::unique_ptr _output_batch; }; } // namespace stream_load diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 2ab6ab1b921816..f3f885ab76fda7 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -102,6 +102,9 @@ void MemTable::insert(const Tuple* tuple) { _agg_buffer_pool.clear(); } +void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t& num_rows) { + // TODO: +} void MemTable::_tuple_to_row(const Tuple* tuple, ContiguousRow* row, MemPool* mem_pool) { for (size_t i = 0; i < _slot_descs->size(); ++i) { auto cell = row->cell(i); diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 0d483eaa4cb31e..4d347c16bb270d 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -65,6 +65,7 @@ #include "util/priority_thread_pool.hpp" #include 
"util/priority_work_stealing_thread_pool.hpp" #include "vec/runtime/vdata_stream_mgr.h" +#include "vec/runtime/vload_channel_mgr.h" namespace doris { @@ -142,7 +143,7 @@ Status ExecEnv::_init(const std::vector& store_paths) { _tmp_file_mgr = new TmpFileMgr(this); _bfd_parser = BfdParser::create(); _broker_mgr = new BrokerMgr(this); - _load_channel_mgr = new LoadChannelMgr(); + _load_channel_mgr = new vectorized::VLoadChannelMgr(); _load_stream_mgr = new LoadStreamMgr(); _internal_client_cache = new BrpcClientCache(); _function_client_cache = new BrpcClientCache(); diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index ccbde602fe883b..e614490f2647ff 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -96,6 +96,8 @@ set(VEC_FILES exec/vassert_num_rows_node.cpp exec/vrepeat_node.cpp exec/vtable_function_node.cpp + exec/vbroker_scan_node.cpp + exec/vbroker_scanner.cpp exec/join/vhash_join_node.cpp exprs/vectorized_agg_fn.cpp exprs/vectorized_fn_call.cpp diff --git a/be/src/vec/exec/vbroker_scan_node.cpp b/be/src/vec/exec/vbroker_scan_node.cpp new file mode 100644 index 00000000000000..2054d100ef023a --- /dev/null +++ b/be/src/vec/exec/vbroker_scan_node.cpp @@ -0,0 +1,238 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/vbroker_scan_node.h" + +#include "gen_cpp/PlanNodes_types.h" +#include "runtime/runtime_state.h" +#include "runtime/string_value.h" +#include "runtime/tuple.h" +#include "runtime/tuple_row.h" +#include "runtime/mem_tracker.h" +#include "util/runtime_profile.h" +#include "util/types.h" +#include "vec/exprs/vexpr_context.h" + +namespace doris::vectorized { + +VBrokerScanNode::VBrokerScanNode(ObjectPool* pool, const TPlanNode& tnode, + const DescriptorTbl& descs) + : BrokerScanNode(pool, tnode, descs) { + _vectorized = true; +} + +VBrokerScanNode::~VBrokerScanNode() {} + +Status VBrokerScanNode::start_scanners() { + { + std::unique_lock l(_batch_queue_lock); + _num_running_scanners = 1; + } + _scanner_threads.emplace_back(&VBrokerScanNode::scanner_worker, this, 0, _scan_ranges.size()); + return Status::OK(); +} + +Status VBrokerScanNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { + SCOPED_TIMER(_runtime_profile->total_time_counter()); + // check if CANCELLED. + if (state->is_cancelled()) { + std::unique_lock l(_batch_queue_lock); + if (update_status(Status::Cancelled("Cancelled"))) { + // Notify all scanners + _queue_writer_cond.notify_all(); + } + } + + if (_scan_finished.load()) { + *eos = true; + return Status::OK(); + } + + std::shared_ptr scanner_block; + { + std::unique_lock l(_batch_queue_lock); + while (_process_status.ok() && !_runtime_state->is_cancelled() && + _num_running_scanners > 0 && _block_queue.empty()) { + SCOPED_TIMER(_wait_scanner_timer); + _queue_reader_cond.wait_for(l, std::chrono::seconds(1)); + } + if (!_process_status.ok()) { + // Some scanner process failed. 
+ return _process_status; + } + if (_runtime_state->is_cancelled()) { + if (update_status(Status::Cancelled("Cancelled"))) { + _queue_writer_cond.notify_all(); + } + return _process_status; + } + if (!_block_queue.empty()) { + scanner_block = _block_queue.front(); + _block_queue.pop_front(); + } + } + + // All scanner has been finished, and all cached batch has been read + if (scanner_block == nullptr) { + _scan_finished.store(true); + *eos = true; + return Status::OK(); + } + + // notify one scanner + _queue_writer_cond.notify_one(); + + reached_limit(scanner_block.get(), eos); + *block = *scanner_block; + + if (*eos) { + _scan_finished.store(true); + _queue_writer_cond.notify_all(); + LOG(INFO) << "VBrokerScanNode ReachedLimit."; + } else { + *eos = false; + } + + return Status::OK(); +} + +Status VBrokerScanNode::close(RuntimeState* state) { + auto status = BrokerScanNode::close(state); + _block_queue.clear(); + return status; +} + +Status VBrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, + const std::vector& vconjunct_ctxs, + ScannerCounter* counter) { + //create scanner object and open + std::unique_ptr scanner = create_scanner(scan_range, counter); + RETURN_IF_ERROR(scanner->open()); + bool scanner_eof = false; + + const int batch_size = _runtime_state->batch_size(); + size_t slot_num = _tuple_desc->slots().size(); + + while (!scanner_eof) { + std::shared_ptr block(new vectorized::Block()); + std::vector columns(slot_num); + for (int i = 0; i < slot_num; i++) { + columns[i] = _tuple_desc->slots()[i]->get_empty_mutable_column(); + } + + while (columns[0]->size() < batch_size && !scanner_eof) { + RETURN_IF_CANCELLED(_runtime_state); + // If we have finished all works + if (_scan_finished.load()) { + return Status::OK(); + } + + RETURN_IF_ERROR(scanner->get_next(columns, &scanner_eof)); + if (scanner_eof) { + break; + } + } + + if (columns[0]->size() > 0) { + auto n_columns = 0; + for (const auto slot_desc : _tuple_desc->slots()) { + 
block->insert(ColumnWithTypeAndName(std::move(columns[n_columns++]), + slot_desc->get_data_type_ptr(), + slot_desc->col_name())); + } + + auto old_rows = block->rows(); + + RETURN_IF_ERROR(VExprContext::filter_block(_vconjunct_ctx_ptr, block.get(), + _tuple_desc->slots().size())); + + counter->num_rows_unselected += old_rows - block->rows(); + + std::unique_lock l(_batch_queue_lock); + while (_process_status.ok() && !_scan_finished.load() && + !_runtime_state->is_cancelled() && + // stop pushing more batch if + // 1. too many batches in queue, or + // 2. at least one batch in queue and memory exceed limit. + (_block_queue.size() >= _max_buffered_batches || + (mem_tracker()->AnyLimitExceeded(MemLimit::HARD) && !_block_queue.empty()))) { + _queue_writer_cond.wait_for(l, std::chrono::seconds(1)); + } + // Process already set failed, so we just return OK + if (!_process_status.ok()) { + return Status::OK(); + } + // Scan already finished, just return + if (_scan_finished.load()) { + return Status::OK(); + } + // Runtime state is canceled, just return cancel + if (_runtime_state->is_cancelled()) { + return Status::Cancelled("Cancelled"); + } + // Queue size Must be smaller than _max_buffered_batches + _block_queue.push_back(block); + + // Notify reader to + _queue_reader_cond.notify_one(); + } + } + + return Status::OK(); +} + +void VBrokerScanNode::scanner_worker(int start_idx, int length) { + // Clone expr context + std::vector vscanner_expr_ctxs; + auto status = VExpr::clone_if_not_exists({*_vconjunct_ctx_ptr}, _runtime_state, &vscanner_expr_ctxs); + if (!status.ok()) { + LOG(WARNING) << "Clone conjuncts failed."; + } + + ScannerCounter counter; + for (int i = 0; i < length && status.ok(); ++i) { + const TBrokerScanRange& scan_range = + _scan_ranges[start_idx + i].scan_range.broker_scan_range; + status = scanner_scan(scan_range, vscanner_expr_ctxs, &counter); + if (!status.ok()) { + LOG(WARNING) << "Scanner[" << start_idx + i + << "] process failed. 
status=" << status.get_error_msg(); + } + } + + // Update stats + _runtime_state->update_num_rows_load_filtered(counter.num_rows_filtered); + _runtime_state->update_num_rows_load_unselected(counter.num_rows_unselected); + + // scanner is going to finish + { + std::lock_guard l(_batch_queue_lock); + if (!status.ok()) { + update_status(status); + } + // This scanner will finish + _num_running_scanners--; + } + _queue_reader_cond.notify_all(); + // If one scanner failed, others don't need scan any more + if (!status.ok()) { + _queue_writer_cond.notify_all(); + } + VExpr::close(vscanner_expr_ctxs, _runtime_state); +} + +} \ No newline at end of file diff --git a/be/src/vec/exec/vbroker_scan_node.h b/be/src/vec/exec/vbroker_scan_node.h new file mode 100644 index 00000000000000..384f792ddb11eb --- /dev/null +++ b/be/src/vec/exec/vbroker_scan_node.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "exec/broker_scan_node.h" +#include "exec/scan_node.h" +#include "runtime/descriptors.h" +//#include "vec/exec/vbroker_scanner.h" +namespace doris { + +class RuntimeState; +class Status; + +namespace vectorized { +class VBrokerScanNode : public BrokerScanNode { +public: + VBrokerScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); + virtual ~VBrokerScanNode(); + + virtual Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; + + // Close the scanner, and report errors. + virtual Status close(RuntimeState* state) override; + +private: + virtual Status start_scanners() override; + + void scanner_worker(int start_idx, int length); + // Scan one range + Status scanner_scan(const TBrokerScanRange& scan_range, + const std::vector& vconjunct_ctxs, ScannerCounter* counter); + + std::deque> _block_queue; +}; +} // namespace vectorized +} // namespace doris \ No newline at end of file diff --git a/be/src/vec/exec/vbroker_scanner.cpp b/be/src/vec/exec/vbroker_scanner.cpp new file mode 100644 index 00000000000000..7c7990990f9611 --- /dev/null +++ b/be/src/vec/exec/vbroker_scanner.cpp @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/vbroker_scanner.h" + +#include +#include +#include + +#include "exec/exec_node.h" +#include "exprs/expr_context.h" +#include "exec/plain_text_line_reader.h" + +namespace doris::vectorized { +VBrokerScanner::VBrokerScanner(RuntimeState* state, RuntimeProfile* profile, + const TBrokerScanRangeParams& params, + const std::vector& ranges, + const std::vector& broker_addresses, + const std::vector& pre_filter_texprs, + ScannerCounter* counter) + : BrokerScanner(state, profile, params, ranges, broker_addresses, pre_filter_texprs, counter) { + +} + +VBrokerScanner::~VBrokerScanner() { +} + +Status VBrokerScanner::get_next(std::vector& columns, bool* eof) { + SCOPED_TIMER(_read_timer); + // Get one line + while (!_scanner_eof) { + if (_cur_line_reader == nullptr || _cur_line_reader_eof) { + RETURN_IF_ERROR(open_next_reader()); + // If there isn't any more reader, break this + if (_scanner_eof) { + continue; + } + } + const uint8_t* ptr = nullptr; + size_t size = 0; + RETURN_IF_ERROR(_cur_line_reader->read_line(&ptr, &size, &_cur_line_reader_eof)); + if (_skip_next_line) { + _skip_next_line = false; + continue; + } + if (size == 0) { + // Read empty row, just continue + continue; + } + { + COUNTER_UPDATE(_rows_read_counter, 1); + SCOPED_TIMER(_materialize_timer); + RETURN_IF_ERROR(_convert_one_row(Slice(ptr, size), columns)); + if (_success) { + free_expr_local_allocations(); + } + break; // break always + } + } + if (_scanner_eof) { + *eof = true; + } else { + *eof = false; + } + return Status::OK(); +} + +Status VBrokerScanner::_convert_one_row(const Slice& line, std::vector& columns) { + RETURN_IF_ERROR(_line_to_src_tuple(line)); + if (!_success) { + // If not success, which means we met an invalid row, return. 
+ return Status::OK(); + } + + return _fill_dest_columns(columns); +} + +Status VBrokerScanner::_fill_dest_columns(std::vector& columns) { + // filter src tuple by preceding filter first + if (!ExecNode::eval_conjuncts(&_pre_filter_ctxs[0], _pre_filter_ctxs.size(), _src_tuple_row)) { + _counter->num_rows_unselected++; + _success = false; + return Status::OK(); + } + // convert and fill dest tuple + int ctx_idx = 0; + for (auto slot_desc : _dest_tuple_desc->slots()) { + if (!slot_desc->is_materialized()) { + continue; + } + + int dest_index = ctx_idx++; + auto* column_ptr = columns[dest_index].get(); + + ExprContext* ctx = _dest_expr_ctx[dest_index]; + void* value = ctx->get_value(_src_tuple_row); + if (value == nullptr) { + // Only when the expr return value is null, we will check the error message. + std::string expr_error = ctx->get_error_msg(); + if (!expr_error.empty()) { + RETURN_IF_ERROR(_state->append_error_msg_to_file( + [&]() -> std::string { + return _src_tuple_row->to_string(*(_row_desc.get())); + }, + [&]() -> std::string { return expr_error; }, &_scanner_eof)); + _counter->num_rows_filtered++; + // The ctx is reused, so must clear the error state and message. + ctx->clear_error_msg(); + _success = false; + return Status::OK(); + } + // If _strict_mode is false, _src_slot_descs_order_by_dest size could be zero + if (_strict_mode && (_src_slot_descs_order_by_dest[dest_index] != nullptr) && + !_src_tuple->is_null( + _src_slot_descs_order_by_dest[dest_index]->null_indicator_offset())) { + RETURN_IF_ERROR(_state->append_error_msg_to_file( + [&]() -> std::string { + return _src_tuple_row->to_string(*(_row_desc.get())); + }, + [&]() -> std::string { + // Type of the slot is must be Varchar in _src_tuple. 
+ StringValue* raw_value = _src_tuple->get_string_slot( + _src_slot_descs_order_by_dest[dest_index]->tuple_offset()); + std::string raw_string; + if (raw_value != nullptr) { //is not null then get raw value + raw_string = raw_value->to_string(); + } + fmt::memory_buffer error_msg; + fmt::format_to(error_msg, + "column({}) value is incorrect while strict mode is {}, " + "src value is {}", + slot_desc->col_name(), _strict_mode, raw_string); + return fmt::to_string(error_msg); /* memory_buffer::data() is not NUL-terminated; copy exactly size() bytes */ + }, + &_scanner_eof)); + _counter->num_rows_filtered++; + _success = false; + return Status::OK(); + } + if (!slot_desc->is_nullable()) { + RETURN_IF_ERROR(_state->append_error_msg_to_file( + [&]() -> std::string { + return _src_tuple_row->to_string(*(_row_desc.get())); + }, + [&]() -> std::string { + fmt::memory_buffer error_msg; + fmt::format_to( + error_msg, + "column({}) values is null while columns is not nullable", + slot_desc->col_name()); + return fmt::to_string(error_msg); /* memory_buffer::data() is not NUL-terminated; copy exactly size() bytes */ + }, + &_scanner_eof)); + _counter->num_rows_filtered++; + _success = false; + return Status::OK(); + } + auto* nullable_column = + reinterpret_cast(column_ptr); + nullable_column->insert_data(nullptr, 0); + continue; + } + if (slot_desc->is_nullable()) { + auto* nullable_column = + reinterpret_cast(column_ptr); + nullable_column->get_null_map_data().push_back(0); + column_ptr = &nullable_column->get_nested_column(); + } + char* value_ptr = (char*)value; + switch (slot_desc->type().type) { + case TYPE_BOOLEAN: { + assert_cast*>(column_ptr)->insert_data(value_ptr, 0); + break; + } + case TYPE_TINYINT: { + assert_cast*>(column_ptr)->insert_data(value_ptr, 0); + break; + } + case TYPE_SMALLINT: { + assert_cast*>(column_ptr)->insert_data(value_ptr, 0); + break; + } + case TYPE_INT: { + assert_cast*>(column_ptr)->insert_data(value_ptr, 0); + break; + } + case TYPE_BIGINT: { + assert_cast*>(column_ptr)->insert_data(value_ptr, 0); + break; + } + case TYPE_LARGEINT: { + assert_cast*>(column_ptr)->insert_data(value_ptr, 0); + break; + } + case 
TYPE_FLOAT: { + assert_cast*>(column_ptr)->insert_data(value_ptr, 0); + break; + } + case TYPE_DOUBLE: { + assert_cast*>(column_ptr)->insert_data(value_ptr, 0); + break; + } + case TYPE_CHAR: { + Slice* slice = reinterpret_cast(value_ptr); + assert_cast(column_ptr) + ->insert_data(slice->data, strnlen(slice->data, slice->size)); + break; + } + case TYPE_VARCHAR: + case TYPE_STRING: { + Slice* slice = reinterpret_cast(value_ptr); + assert_cast(column_ptr)->insert_data(slice->data, slice->size); + break; + } + case TYPE_OBJECT: { + Slice* slice = reinterpret_cast(value_ptr); + // insert_default() + auto* target_column = assert_cast(column_ptr); + + target_column->insert_default(); + BitmapValue* pvalue = nullptr; + int pos = target_column->size() - 1; + pvalue = &target_column->get_element(pos); + + if (slice->size != 0) { + BitmapValue value; + value.deserialize(slice->data); + *pvalue = std::move(value); + } else { + *pvalue = std::move(*reinterpret_cast(slice->data)); + } + break; + } + case TYPE_HLL: { + Slice* slice = reinterpret_cast(value_ptr); + auto* target_column = assert_cast(column_ptr); + + target_column->insert_default(); + HyperLogLog* pvalue = nullptr; + int pos = target_column->size() - 1; + pvalue = &target_column->get_element(pos); + if (slice->size != 0) { + HyperLogLog value; + value.deserialize(*slice); + *pvalue = std::move(value); + } else { + *pvalue = std::move(*reinterpret_cast(slice->data)); + } + break; + } + case TYPE_DECIMALV2: { + assert_cast*>(column_ptr) + ->insert_data(reinterpret_cast(value_ptr), 0); + break; + } + case TYPE_DATETIME: { + uint64_t value = *reinterpret_cast(value_ptr); + VecDateTimeValue data(value); + assert_cast*>(column_ptr) + ->insert_data(reinterpret_cast(&data), 0); + break; + } + case TYPE_DATE: { + uint64_t value = *reinterpret_cast(value_ptr); + VecDateTimeValue date; + date.from_olap_date(value); + assert_cast*>(column_ptr) + ->insert_data(reinterpret_cast(&date), 0); + break; + } + default: { + break; + } 
+ } + } + _success = true; + return Status::OK(); +} +} // namespace doris::vectorized diff --git a/be/src/vec/exec/vbroker_scanner.h b/be/src/vec/exec/vbroker_scanner.h new file mode 100644 index 00000000000000..b21a086f3e5080 --- /dev/null +++ b/be/src/vec/exec/vbroker_scanner.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + + +namespace doris::vectorized { +class VBrokerScanner : public BrokerScanner { +public: + VBrokerScanner(RuntimeState* state, RuntimeProfile* profile, + const TBrokerScanRangeParams& params, const std::vector& ranges, + const std::vector& broker_addresses, + const std::vector& pre_filter_texprs, ScannerCounter* counter); + virtual ~VBrokerScanner(); + + Status get_next(std::vector& columns, bool* eof) override; + +private: + Status _convert_one_row(const Slice& line, std::vector& columns); + Status _fill_dest_columns(std::vector& columns); + +}; +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp index e8d3cc0f5f79ff..a3ace9da8abdf8 100644 --- a/be/src/vec/sink/vtablet_sink.cpp +++ b/be/src/vec/sink/vtablet_sink.cpp @@ -21,20 +21,307 @@ #include "vec/core/block.h" #include "vec/exprs/vexpr.h" #include "vec/exprs/vexpr_context.h" +#include "util/debug/sanitizer_scopes.h" +#include "util/time.h" +#include "util/proto_util.h" namespace doris { namespace stream_load { +VNodeChannel::VNodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id) + : NodeChannel(parent, index_channel, node_id) { + _is_vectorized = true; +} + +VNodeChannel::~VNodeChannel() { + _cur_add_block_request.release_id(); +} + +void VNodeChannel::clear_all_blocks() { + std::lock_guard lg(_pending_batches_lock); + std::queue empty; + std::swap(_pending_blocks, empty); + _cur_mutable_block.reset(); +} + +// if "_cancelled" is set to true, +// no need to set _cancel_msg because the error will be +// returned directly via "TabletSink::prepare()" method. 
+Status VNodeChannel::init(RuntimeState* state) { + RETURN_IF_ERROR(NodeChannel::init(state)); + + _cur_mutable_block.reset(new vectorized::MutableBlock({_tuple_desc})); + + // Initialize _cur_add_block_request + _cur_add_block_request.set_allocated_id(&_parent->_load_id); + _cur_add_block_request.set_index_id(_index_channel->_index_id); + _cur_add_block_request.set_sender_id(_parent->_sender_id); + _cur_add_block_request.set_backend_id(_node_id); + _cur_add_block_request.set_eos(false); + + _name = fmt::format("VNodeChannel[{}-{}]", _index_channel->_index_id, _node_id); + + return Status::OK(); +} + +Status VNodeChannel::open_wait() { + Status status = NodeChannel::open_wait(); + if (!status.ok()) { + return status; + } + + // add block closure + _add_block_closure = ReusableClosure::create(); + _add_block_closure->addFailedHandler([this](bool is_last_rpc) { + // If rpc failed, mark all tablets on this node channel as failed + _index_channel->mark_as_failed(this->node_id(), this->host(), _add_block_closure->cntl.ErrorText(), -1); + Status st = _index_channel->check_intolerable_failure(); + if (!st.ok()) { + _cancel_with_msg(fmt::format("{}, err: {}", channel_info(), st.get_error_msg())); + } else if (is_last_rpc) { + // if this is last rpc, will must set _add_batches_finished. otherwise, node channel's close_wait + // will be blocked. 
+ _add_batches_finished = true; + } + }); + + _add_block_closure->addSuccessHandler([this](const PTabletWriterAddBlockResult& result, + bool is_last_rpc) { + Status status(result.status()); + if (status.ok()) { + // if has error tablet, handle them first + for (auto& error : result.tablet_errors()) { + _index_channel->mark_as_failed(this->node_id(), this->host(), error.msg(), error.tablet_id()); + } + + Status st = _index_channel->check_intolerable_failure(); + if (!st.ok()) { + _cancel_with_msg(st.get_error_msg()); + } else if (is_last_rpc) { + for (auto& tablet : result.tablet_vec()) { + TTabletCommitInfo commit_info; + commit_info.tabletId = tablet.tablet_id(); + commit_info.backendId = _node_id; + _tablet_commit_infos.emplace_back(std::move(commit_info)); + } + _add_batches_finished = true; + } + } else { + _cancel_with_msg(fmt::format("{}, add batch req success but status isn't ok, err: {}", + channel_info(), status.get_error_msg())); + } + + if (result.has_execution_time_us()) { + _add_batch_counter.add_batch_execution_time_us += result.execution_time_us(); + _add_batch_counter.add_batch_wait_execution_time_us += result.wait_execution_time_us(); + _add_batch_counter.add_batch_num++; + } + }); + return status; +} + +Status VNodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { + // If add_row() when _eos_is_produced==true, there must be sth wrong, we can only mark this channel as failed. + auto st = none_of({_cancelled, _eos_is_produced}); + if (!st.ok()) { + if (_cancelled) { + std::lock_guard l(_cancel_msg_lock); + return Status::InternalError("add row failed. " + _cancel_msg); + } else { + return st.clone_and_prepend("already stopped, can't add row. cancelled/eos: "); + } + } + + // We use OlapTableSink mem_tracker which has the same ancestor of _plan node, + // so in the ideal case, mem limit is a matter for _plan node. + // But there is still some unfinished things, we do mem limit here temporarily. 
+ // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. + // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). + while (!_cancelled && _parent->_mem_tracker->AnyLimitExceeded(MemLimit::HARD) && + _pending_batches_num > 0) { + SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + _cur_mutable_block->add_row(block_row.first, block_row.second); + _cur_add_block_request.add_tablet_ids(tablet_id); + + if (_cur_mutable_block->rows() == _batch_size) { + { + SCOPED_ATOMIC_TIMER(&_queue_push_lock_ns); + std::lock_guard l(_pending_batches_lock); + //To simplify the add_row logic, postpone adding block into req until the time of sending req + _pending_blocks.emplace(std::move(_cur_mutable_block), _cur_add_block_request); + _pending_batches_num++; + } + + _cur_mutable_block.reset(new vectorized::MutableBlock({_tuple_desc})); + _cur_add_block_request.clear_tablet_ids(); + } + + return Status::OK(); +} + +int VNodeChannel::try_send_and_fetch_status(std::unique_ptr& thread_pool_token) { + auto st = none_of({_cancelled, _send_finished}); + if (!st.ok()) { + return 0; + } + bool is_finished = true; + if (!_add_block_closure->is_packet_in_flight() && _pending_batches_num > 0 && + _last_patch_processed_finished.compare_exchange_strong(is_finished, false)) { + auto s = thread_pool_token->submit_func(std::bind(&VNodeChannel::try_send_block, this)); + if (!s.ok()) { + _cancel_with_msg("submit send_batch task to send_batch_thread_pool failed"); + } + } + return _send_finished ? 
0 : 1; +} +void VNodeChannel::try_send_block() { + SCOPED_ATOMIC_TIMER(&_actual_consume_ns); + AddBlockReq send_block; + { + debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; + std::lock_guard l(_pending_batches_lock); + DCHECK(!_pending_blocks.empty()); + send_block = std::move(_pending_blocks.front()); + _pending_blocks.pop(); + _pending_batches_num--; + } + + auto mutable_block = std::move(send_block.first); + auto request = std::move(send_block.second); // doesn't need to be saved in heap + + // tablet_ids has already set when add row + request.set_packet_seq(_next_packet_seq); + auto block = mutable_block->to_block(); + if (block.rows() > 0) { + SCOPED_ATOMIC_TIMER(&_serialize_batch_ns); + size_t uncompressed_bytes = 0, compressed_bytes = 0; + Status st = block.serialize(request.mutable_block(), &uncompressed_bytes, + &compressed_bytes, &_column_values_buffer); + if (!st.ok()) { + cancel(fmt::format("{}, err: {}", channel_info(), st.get_error_msg())); + return; + } + if (compressed_bytes >= double(config::brpc_max_body_size) * 0.95f) { + LOG(WARNING) << "send block too large, this rpc may failed. 
send size: " + << compressed_bytes << ", threshold: " << config::brpc_max_body_size + << ", " << channel_info(); + } + } + + _add_block_closure->reset(); + int remain_ms = _rpc_timeout_ms - _timeout_watch.elapsed_time() / NANOS_PER_MILLIS; + if (UNLIKELY(remain_ms < config::min_load_rpc_timeout_ms)) { + if (remain_ms <= 0 && !request.eos()) { + cancel(fmt::format("{}, err: timeout", channel_info())); + return; + } else { + remain_ms = config::min_load_rpc_timeout_ms; + } + } + _add_block_closure->cntl.set_timeout_ms(remain_ms); + if (config::tablet_writer_ignore_eovercrowded) { + _add_block_closure->cntl.ignore_eovercrowded(); + } + + if (request.eos()) { + for (auto pid : _parent->_partition_ids) { + request.add_partition_ids(pid); + } + + // eos request must be the last request + _add_block_closure->end_mark(); + _send_finished = true; + CHECK(_pending_batches_num == 0) << _pending_batches_num; + } + + if (request.has_block()) { + request_block_transfer_attachment>( + &request, _column_values_buffer, _add_block_closure); + } + _add_block_closure->set_in_flight(); + _stub->tablet_writer_add_block(&_add_block_closure->cntl, &request, &_add_block_closure->result, + _add_block_closure); + + _next_packet_seq++; + _last_patch_processed_finished = true; +} + +void VNodeChannel::_close_check() { + std::lock_guard lg(_pending_batches_lock); + CHECK(_pending_blocks.empty()) << name(); + CHECK(_cur_mutable_block == nullptr) << name(); +} + +Status VNodeChannel::mark_close() { + auto st = none_of({_cancelled, _eos_is_produced}); + if (!st.ok()) { + if (_cancelled) { + std::lock_guard l(_cancel_msg_lock); + return Status::InternalError("mark close failed. " + _cancel_msg); + } else { + return st.clone_and_prepend("already stopped, can't mark as closed. 
cancelled/eos: "); + } + } + + _cur_add_block_request.set_eos(true); + { + debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; + std::lock_guard l(_pending_batches_lock); + _pending_blocks.emplace(std::move(_cur_mutable_block), _cur_add_block_request); + _pending_batches_num++; + DCHECK(_pending_blocks.back().second.eos()); + _close_time_ms = UnixMillis(); + LOG(INFO) << channel_info() + << " mark closed, left pending batch size: " << _pending_blocks.size(); + } + + _eos_is_produced = true; + return Status::OK(); +} + +VIndexChannel::VIndexChannel(OlapTableSink* parent, int64_t index_id) + : IndexChannel(parent, index_id) { + _is_vectorized = true; +} + +VIndexChannel::~VIndexChannel() {} + +void VIndexChannel::add_row(BlockRow& block_row, int64_t tablet_id) { + auto it = _channels_by_tablet.find(tablet_id); + DCHECK(it != _channels_by_tablet.end()) << "unknown tablet, tablet_id=" << tablet_id; + for (auto channel : it->second) { + // if this node channel is already failed, this add_row will be skipped + auto st = channel->add_row(block_row, tablet_id); + if (!st.ok()) { + mark_as_failed(channel->node_id(), channel->host(), st.get_error_msg(), tablet_id); + } + } +} + VOlapTableSink::VOlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc, const std::vector& texprs, Status* status) : OlapTableSink(pool, row_desc, texprs, status) { + + _is_vectorized = true; + // From the thrift expressions create the real exprs. vectorized::VExpr::create_expr_trees(pool, texprs, &_output_vexpr_ctxs); - // Do not use the origin data scala expr, clear scala expr contexts - _output_expr_ctxs.clear(); _name = "VOlapTableSink"; } +VOlapTableSink::~VOlapTableSink() { + // We clear NodeChannels' batches here, cuz NodeChannels' batches destruction will use + // OlapTableSink::_mem_tracker and its parents. + // But their destructions are after OlapTableSink's. 
+ for (auto index_channel : _channels) { + index_channel->for_each_node_channel([](const std::shared_ptr& ch) { ch->clear_all_blocks(); }); + } +} + Status VOlapTableSink::init(const TDataSink& sink) { RETURN_IF_ERROR(OlapTableSink::init(sink)); _vpartition = _pool->add(new VOlapTablePartitionParam(_schema, sink.olap_table_sink.partition)); diff --git a/be/src/vec/sink/vtablet_sink.h b/be/src/vec/sink/vtablet_sink.h index 5514ff190945eb..43528a2e86e7d2 100644 --- a/be/src/vec/sink/vtablet_sink.h +++ b/be/src/vec/sink/vtablet_sink.h @@ -28,6 +28,58 @@ class VExprContext; namespace stream_load { +class VIndexChannel; +class VNodeChannel : public NodeChannel { +public: + VNodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id); + + ~VNodeChannel() override; + + Status init(RuntimeState* state) override; + + Status open_wait() override; + + Status add_row(BlockRow& block_row, int64_t tablet_id) override; + + int try_send_and_fetch_status(std::unique_ptr& thread_pool_token) override; + + void try_send_block(); + + void clear_all_blocks() override; + + // two ways to stop channel: + // 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response. + // 2. just cancel() + Status mark_close() override; + +protected: + void _close_check() override; + +private: + std::unique_ptr _cur_mutable_block; + PTabletWriterAddBlockRequest _cur_add_block_request; + + using AddBlockReq = std::pair, PTabletWriterAddBlockRequest>; + std::queue _pending_blocks; + ReusableClosure* _add_block_closure = nullptr; + + // This buffer is used to store the serialized block data + // The data in the buffer is copied to the attachment of the brpc when it is sent, + // to avoid an extra pb serialization in the brpc. 
+ std::string _column_values_buffer; + +}; + +class VIndexChannel : public IndexChannel { +public: + VIndexChannel(OlapTableSink* parent, int64_t index_id); + + ~VIndexChannel() override; + + void add_row(BlockRow& block_row, int64_t tablet_id) override; + +}; + class OlapTableSink; // Write block data to Olap Table. @@ -40,6 +92,8 @@ class VOlapTableSink : public OlapTableSink { VOlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc, const std::vector& texprs, Status* status); + ~VOlapTableSink() override; + Status init(const TDataSink& sink) override; // TODO: unify the code of prepare/open/close with result sink Status prepare(RuntimeState* state) override; diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java index a454647b1cd031..761cbf9fcec05d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java @@ -140,6 +140,7 @@ public TExecPlanFragmentParams plan(TUniqueId loadId) throws UserException { scanNode.init(analyzer); descTable.computeStatAndMemLayout(); scanNode.finalize(analyzer); + scanNode.convertToVectoriezd(); int timeout = taskInfo.getTimeout(); if (taskInfo instanceof RoutineLoadJob) { @@ -191,6 +192,7 @@ public TExecPlanFragmentParams plan(TUniqueId loadId) throws UserException { queryOptions.setMemLimit(taskInfo.getMemLimit()); // for stream load, we use exec_mem_limit to limit the memory usage of load channel. 
queryOptions.setLoadMemLimit(taskInfo.getMemLimit()); + queryOptions.setEnableVectorizedEngine(true); params.setQueryOptions(queryOptions); TQueryGlobals queryGlobals = new TQueryGlobals(); queryGlobals.setNowString(DATE_FORMAT.format(new Date())); From fe3f7716c1034d4dc4b15579aec48377634df388 Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Wed, 9 Mar 2022 23:55:50 +0800 Subject: [PATCH 05/32] =?UTF-8?q?=E9=81=BF=E5=85=8D=20VDeltaWriter=20?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=20overload=20=E9=80=A0=E6=88=90=E5=9F=BA?= =?UTF-8?q?=E7=B1=BB=E7=9A=84=E6=96=B9=E6=B3=95=E8=A2=AB=E9=9A=90=E8=97=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- be/src/olap/delta_writer.h | 2 +- be/src/vec/olap/vdelta_writer.cpp | 2 +- be/src/vec/olap/vdelta_writer.h | 2 +- be/src/vec/runtime/vtablets_channel.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/be/src/olap/delta_writer.h b/be/src/olap/delta_writer.h index f06157ea63c61e..6c25d5fd2e5d25 100644 --- a/be/src/olap/delta_writer.h +++ b/be/src/olap/delta_writer.h @@ -62,7 +62,7 @@ class DeltaWriter { OLAPStatus write(Tuple* tuple); OLAPStatus write(const RowBatch* row_batch, const std::vector& row_idxs); - virtual OLAPStatus write(const vectorized::Block* block, const std::vector& row_idxs) { + virtual OLAPStatus write_block(const vectorized::Block* block, const std::vector& row_idxs) { return OLAP_ERR_READER_INITIALIZE_ERROR; } diff --git a/be/src/vec/olap/vdelta_writer.cpp b/be/src/vec/olap/vdelta_writer.cpp index 519cf489126f7c..b46954391798f0 100644 --- a/be/src/vec/olap/vdelta_writer.cpp +++ b/be/src/vec/olap/vdelta_writer.cpp @@ -37,7 +37,7 @@ OLAPStatus VDeltaWriter::open(WriteRequest* req, const std::shared_ptr& row_idxs) { +OLAPStatus VDeltaWriter::write_block(const vectorized::Block* block, const std::vector& row_idxs) { if (UNLIKELY(row_idxs.empty())) { return OLAP_SUCCESS; } diff --git a/be/src/vec/olap/vdelta_writer.h b/be/src/vec/olap/vdelta_writer.h index 
a3c8eb4e9f7d3e..6716c429a0782f 100644 --- a/be/src/vec/olap/vdelta_writer.h +++ b/be/src/vec/olap/vdelta_writer.h @@ -30,7 +30,7 @@ class VDeltaWriter : public DeltaWriter { static OLAPStatus open(WriteRequest* req, const std::shared_ptr& parent, VDeltaWriter** writer); - virtual OLAPStatus write(const vectorized::Block* block, const std::vector& row_idxs) override; + virtual OLAPStatus write_block(const vectorized::Block* block, const std::vector& row_idxs) override; private: VDeltaWriter(WriteRequest* req, const std::shared_ptr& parent, diff --git a/be/src/vec/runtime/vtablets_channel.cpp b/be/src/vec/runtime/vtablets_channel.cpp index 275f68a76533ab..1840b0d2be18d7 100644 --- a/be/src/vec/runtime/vtablets_channel.cpp +++ b/be/src/vec/runtime/vtablets_channel.cpp @@ -117,7 +117,7 @@ Status VTabletsChannel::add_block(const PTabletWriterAddBlockRequest& request, strings::Substitute("unknown tablet to append data, tablet=$0", tablet_to_rowidxs_it.first)); } - OLAPStatus st = tablet_writer_it->second->write(&block, tablet_to_rowidxs_it.second); + OLAPStatus st = tablet_writer_it->second->write_block(&block, tablet_to_rowidxs_it.second); if (st != OLAP_SUCCESS) { auto err_msg = strings::Substitute( "tablet writer write failed, tablet_id=$0, txn_id=$1, err=$2", From 3533efafc524dd99beaba2f4ed5a1a4f4ed5ae21 Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Wed, 9 Mar 2022 18:57:44 +0800 Subject: [PATCH 06/32] memtable vectorization part 1 --- be/src/olap/memtable.cpp | 84 +++++++++++++++++++++++++++++++++++---- be/src/olap/memtable.h | 36 ++++++++++++++++- be/src/vec/core/block.cpp | 9 +++++ be/src/vec/core/block.h | 24 +++++++++++ 4 files changed, 145 insertions(+), 8 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index f3f885ab76fda7..820c5d9b8cb64e 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -42,14 +42,20 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet 
_table_mem_pool(new MemPool(_mem_tracker.get())), _schema_size(_schema->schema_size()), _rowset_writer(rowset_writer) { - if (tablet_schema->sort_type() == SortType::ZORDER) { - _row_comparator = - std::make_shared(_schema, tablet_schema->sort_col_num()); - } else { - _row_comparator = std::make_shared(_schema); + if (config::enable_storage_vectorization){ + _vec_row_comparator = std::make_shared(_schema); + _vec_skip_list = new VecTable(_vec_row_comparator.get(), _table_mem_pool.get(), + _keys_type == KeysType::DUP_KEYS); + }else{ + if (tablet_schema->sort_type() == SortType::ZORDER) { + _row_comparator = + std::make_shared(_schema, tablet_schema->sort_col_num()); + } else { + _row_comparator = std::make_shared(_schema); + } + _skip_list = new Table(_row_comparator.get(), _table_mem_pool.get(), + _keys_type == KeysType::DUP_KEYS); } - _skip_list = new Table(_row_comparator.get(), _table_mem_pool.get(), - _keys_type == KeysType::DUP_KEYS); } MemTable::~MemTable() { @@ -64,6 +70,48 @@ int MemTable::RowCursorComparator::operator()(const char* left, const char* righ return compare_row(lhs_row, rhs_row); } +int MemTable::VecRowComparator::operator()(const RowInBlock left, const RowInBlock right) const{ + return left._block->compare_at(left._row_pos, right._row_pos, + _schema->num_key_columns(), + *(right._block), -1); + //nan_direction_hint == -1, NaN and NULLs are considered as least than everything other; +} + +void MemTable::insert(const vectorized::Block* block, const size_t row_pos, const size_t num_rows) +{ + if (_mutableBlock.columns() == 0) + { + auto cloneBlock = block->clone_without_columns(); + _mutableBlock = vectorized::MutableBlock::build_mutable_block(&cloneBlock); + } + size_t cursor_in_mutableblock = _mutableBlock.rows(); + _mutableBlock.add_rows(block, row_pos, num_rows); + for(int i = 0; i < num_rows; i++){ + + + + + insert_one_row_from_block(RowInBlock(&_mutableBlock, cursor_in_mutableblock + i)); + } +} +void 
MemTable::insert_one_row_from_block(struct RowInBlock row_in_block) +{ + _rows++; + bool overwritten = false; + if (_keys_type == KeysType::DUP_KEYS) + { + _vec_skip_list->Insert(row_in_block, &overwritten); + DCHECK(!overwritten) << "Duplicate key model meet overwrite in SkipList"; + return; + } + bool is_exist = _vec_skip_list->Find(row_in_block, &_vec_hint); + if (is_exist){ + _aggregate_two_rowInBlock(row_in_block, _vec_hint.curr->key); + }else{ + _vec_skip_list->InsertWithHint(row_in_block, is_exist, &_vec_hint); + } +} + void MemTable::insert(const Tuple* tuple) { _rows++; bool overwritten = false; @@ -126,6 +174,28 @@ void MemTable::_aggregate_two_row(const ContiguousRow& src_row, TableKey row_in_ } } +void MemTable::_aggregate_two_rowInBlock(RowInBlock new_row, RowInBlock row_in_skiplist){ + if (_tablet_schema->has_sequence_col()) { + auto sequence_idx = _tablet_schema->sequence_col_idx(); + auto seq_dst_cell = row_in_skiplist.cell(sequence_idx); + auto seq_src_cell = new_row.cell(sequence_idx); + auto res = _schema->column(sequence_idx)->compare_cell(seq_dst_cell, seq_src_cell); + // dst sequence column larger than src, don't need to update + if (res > 0) { + return; + } + } + + + for (uint32_t cid = _schema->num_key_columns(); + cid < _schema->num_columns(); + ++cid) { + auto dst_cell = row_in_skiplist.cell(cid); + auto src_cell = new_row.cell(cid); + _schema->column(cid)->agg_update(&dst_cell, &src_cell, _table_mem_pool.get()); + } + +} OLAPStatus MemTable::flush() { VLOG_CRITICAL << "begin to flush memtable for tablet: " << _tablet_id << ", memsize: " << memory_usage() << ", rows: " << _rows; diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 85b709b0d65c5e..3b2778c59fc212 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -26,7 +26,7 @@ #include "runtime/mem_tracker.h" #include "util/tuple_row_zorder_compare.h" #include "vec/core/block.h" - +#include "vec/common/string_ref.h" namespace doris { struct ContiguousRow; @@ 
-49,6 +49,7 @@ class MemTable { std::shared_ptr mem_tracker() { return _mem_tracker; } void insert(const Tuple* tuple); void insert(const vectorized::Block* block, size_t row_pos, size_t& num_rows); + void insert(const vectorized::Block* block, const size_t row_pos, const size_t num_rows);//insert tuple from (row_pos) to (row_pos+num_rows) /// Flush OLAPStatus flush(); OLAPStatus close(); @@ -65,10 +66,30 @@ class MemTable { const Schema* _schema; }; + struct RowInBlock{ + vectorized::MutableBlock* _block; + size_t _row_pos; + RowInBlock(int i):_block(0), _row_pos(0){} //this constructor is for SkipList::NewNode(0, ...) + RowInBlock(vectorized::MutableBlock* block, size_t row_pos):_block(block), _row_pos(row_pos){} + RowCursorCell cell(int cid){ + StringRef ref = _block->mutable_columns()[cid]->get_data_at(_row_pos); + return RowCursorCell(ref.data); + } + }; + class VecRowComparator { + public: + VecRowComparator(const Schema* schema):_schema(schema){}; + int operator()(const RowInBlock left, const RowInBlock right) const; + private: + const Schema* _schema; + }; + private: typedef SkipList Table; typedef Table::key_type TableKey; + typedef SkipList VecTable; + public: /// The iterator of memtable, so that the data in this memtable /// can be visited outside. 
@@ -90,6 +111,9 @@ class MemTable { private: void _tuple_to_row(const Tuple* tuple, ContiguousRow* row, MemPool* mem_pool); void _aggregate_two_row(const ContiguousRow& new_row, TableKey row_in_skiplist); + //for vectorized + void insert_one_row_from_block(struct RowInBlock row_in_block); + void _aggregate_two_rowInBlock(RowInBlock new_row, RowInBlock row_in_skiplist); int64_t _tablet_id; Schema* _schema; @@ -99,6 +123,9 @@ class MemTable { KeysType _keys_type; std::shared_ptr _row_comparator; + + std::shared_ptr _vec_row_comparator; + std::shared_ptr _mem_tracker; // This is a buffer, to hold the memory referenced by the rows that have not // been inserted into the SkipList @@ -117,6 +144,9 @@ class MemTable { Table* _skip_list; Table::Hint _hint; + VecTable* _vec_skip_list; + VecTable::Hint _vec_hint; + RowsetWriter* _rowset_writer; // the data size flushed on disk of this memtable @@ -126,6 +156,10 @@ class MemTable { // in unique or aggragate key model. int64_t _rows = 0; + //for vectorized + vectorized::MutableBlock _mutableBlock; + + }; // class MemTable inline std::ostream& operator<<(std::ostream& os, const MemTable& table) { diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index ca44be9e8f487a..61814f81050433 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -853,6 +853,15 @@ void MutableBlock::add_rows(const Block* block, const int* row_begin, const int* } } +void MutableBlock::add_rows(const Block* block, size_t row_begin, size_t row_end) { + auto& block_data = block->get_columns_with_type_and_name(); + for (size_t i = 0; i < _columns.size(); ++i) { + auto& dst = _columns[i]; + auto& src = *block_data[i].column.get(); + dst->insert_range_from(src, row_begin, row_end); + } +} + Block MutableBlock::to_block(int start_column) { return to_block(start_column, _columns.size()); } diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h index 8fa4a3004ae9be..fdbf63db215cc1 100644 --- a/be/src/vec/core/block.h +++ 
b/be/src/vec/core/block.h @@ -349,6 +349,29 @@ class MutableBlock { DataTypes& data_types() { return _data_types; } + MutableColumnPtr& get_column_by_position(size_t position) { return _columns[position]; } + const MutableColumnPtr& get_column_by_position(size_t position) const { return _columns[position]; } + + DataTypePtr& get_datatype_by_position(size_t position) { return _data_types[position]; } + const DataTypePtr& get_datatype_by_position(size_t position) const { return _data_types[position]; } + + int compare_at(size_t n, size_t m, size_t num_columns, const MutableBlock& rhs, + int nan_direction_hint) const { + DCHECK_GE(columns(), num_columns); + DCHECK_GE(rhs.columns(), num_columns); + + DCHECK_LE(n, rows()); + DCHECK_LE(m, rhs.rows()); + for (size_t i = 0; i < num_columns; ++i) { + DCHECK(get_datatype_by_position(i)->equals(*rhs.get_datatype_by_position(i))); + auto res = get_column_by_position(i)->compare_at(n, m, *(rhs.get_column_by_position(i)), + nan_direction_hint); + if (res) { + return res; + } + } + return 0; + } template void merge(T&& block) { if (_columns.size() == 0 && _data_types.size() == 0) { @@ -392,6 +415,7 @@ class MutableBlock { void add_row(const Block* block, int row); void add_rows(const Block* block, const int* row_begin, const int* row_end); + void add_rows(const Block* block, size_t row_begin, size_t row_end); std::string dump_data(size_t row_limit = 100) const; From c542823ba6451106fda4ffe5083ccc7675ed9088 Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Thu, 10 Mar 2022 10:25:22 +0800 Subject: [PATCH 07/32] merge memtable.insert api --- be/src/olap/memtable.cpp | 13 +++---------- be/src/olap/memtable.h | 3 ++- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 820c5d9b8cb64e..d2fec2ee32b676 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -77,7 +77,7 @@ int MemTable::VecRowComparator::operator()(const RowInBlock left, const RowInBlo 
//nan_direction_hint == -1, NaN and NULLs are considered as least than everything other; } -void MemTable::insert(const vectorized::Block* block, const size_t row_pos, const size_t num_rows) +void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num_rows) { if (_mutableBlock.columns() == 0) { @@ -85,12 +85,8 @@ void MemTable::insert(const vectorized::Block* block, const size_t row_pos, cons _mutableBlock = vectorized::MutableBlock::build_mutable_block(&cloneBlock); } size_t cursor_in_mutableblock = _mutableBlock.rows(); - _mutableBlock.add_rows(block, row_pos, num_rows); - for(int i = 0; i < num_rows; i++){ - - - - + _mutableBlock.add_rows(block, row_pos, row_pos + num_rows); + for(int i = 0; i < num_rows; i++){ insert_one_row_from_block(RowInBlock(&_mutableBlock, cursor_in_mutableblock + i)); } } @@ -150,9 +146,6 @@ void MemTable::insert(const Tuple* tuple) { _agg_buffer_pool.clear(); } -void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t& num_rows) { - // TODO: -} void MemTable::_tuple_to_row(const Tuple* tuple, ContiguousRow* row, MemPool* mem_pool) { for (size_t i = 0; i < _slot_descs->size(); ++i) { auto cell = row->cell(i); diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 3b2778c59fc212..236f8b3f4aebb7 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -48,8 +48,9 @@ class MemTable { size_t memory_usage() const { return _mem_tracker->consumption(); } std::shared_ptr mem_tracker() { return _mem_tracker; } void insert(const Tuple* tuple); + //insert tuple from (row_pos) to (row_pos+num_rows) void insert(const vectorized::Block* block, size_t row_pos, size_t& num_rows); - void insert(const vectorized::Block* block, const size_t row_pos, const size_t num_rows);//insert tuple from (row_pos) to (row_pos+num_rows) + /// Flush OLAPStatus flush(); OLAPStatus close(); From 60053abb7e9ddacdd4932e8a5233bb29fdc66491 Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Thu, 10 Mar 2022 
16:01:51 +0800 Subject: [PATCH 08/32] memtable flush support vectorization --- be/src/olap/memtable.cpp | 43 +++++++++++++++++++---- be/src/olap/memtable.h | 56 +++++++++++++++++++----------- be/src/olap/rowset/rowset_writer.h | 4 ++- 3 files changed, 75 insertions(+), 28 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index d2fec2ee32b676..539248fdb5c78e 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -70,7 +70,7 @@ int MemTable::RowCursorComparator::operator()(const char* left, const char* righ return compare_row(lhs_row, rhs_row); } -int MemTable::VecRowComparator::operator()(const RowInBlock left, const RowInBlock right) const{ +int VecRowComparator::operator()(const RowInBlock left, const RowInBlock right) const{ return left._block->compare_at(left._row_pos, right._row_pos, _schema->num_key_columns(), *(right._block), -1); @@ -79,17 +79,19 @@ int MemTable::VecRowComparator::operator()(const RowInBlock left, const RowInBlo void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num_rows) { - if (_mutableBlock.columns() == 0) + if (_input_mutable_block.columns() == 0) { auto cloneBlock = block->clone_without_columns(); - _mutableBlock = vectorized::MutableBlock::build_mutable_block(&cloneBlock); + _input_mutable_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); + _output_mutable_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); } - size_t cursor_in_mutableblock = _mutableBlock.rows(); - _mutableBlock.add_rows(block, row_pos, row_pos + num_rows); + size_t cursor_in_mutableblock = _input_mutable_block.rows(); + _input_mutable_block.add_rows(block, row_pos, row_pos + num_rows); for(int i = 0; i < num_rows; i++){ - insert_one_row_from_block(RowInBlock(&_mutableBlock, cursor_in_mutableblock + i)); + insert_one_row_from_block(RowInBlock(&_input_mutable_block, cursor_in_mutableblock + i)); } } + void MemTable::insert_one_row_from_block(struct RowInBlock 
row_in_block) { _rows++; @@ -189,6 +191,35 @@ void MemTable::_aggregate_two_rowInBlock(RowInBlock new_row, RowInBlock row_in_s } } +vectorized::Block MemTable::to_block() +{ + VecTable::Iterator it(_vec_skip_list); + vectorized::Block in_block = _input_mutable_block.to_block(); + for (it.SeekToFirst(); it.Valid(); it.Next()) { + _output_mutable_block.add_row(&in_block, it.key()._row_pos); + } + return _output_mutable_block.to_block(); +} + +OLAPStatus MemTable::vflush(){ + VLOG_CRITICAL << "begin to flush memtable for tablet: " << _tablet_id + << ", memsize: " << memory_usage() << ", rows: " << _rows; + int64_t duration_ns = 0; + { + SCOPED_RAW_TIMER(&duration_ns); + vectorized::Block block = to_block(); + OLAPStatus st = _rowset_writer->add_block(&block); + RETURN_NOT_OK(st); + } + DorisMetrics::instance()->memtable_flush_total->increment(1); + DorisMetrics::instance()->memtable_flush_duration_us->increment(duration_ns / 1000); + + return OLAP_SUCCESS; +} +OLAPStatus MemTable::vclose() { + return vflush(); +} + OLAPStatus MemTable::flush() { VLOG_CRITICAL << "begin to flush memtable for tablet: " << _tablet_id << ", memsize: " << memory_usage() << ", rows: " << _rows; diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 236f8b3f4aebb7..2af08a667d15f5 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -36,7 +36,8 @@ class SlotDescriptor; class TabletSchema; class Tuple; class TupleDescriptor; - +class RowInBlock; +class VecRowComparator; class MemTable { public: MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet_schema, @@ -49,7 +50,7 @@ class MemTable { std::shared_ptr mem_tracker() { return _mem_tracker; } void insert(const Tuple* tuple); //insert tuple from (row_pos) to (row_pos+num_rows) - void insert(const vectorized::Block* block, size_t row_pos, size_t& num_rows); + void insert(const vectorized::Block* block, size_t row_pos, size_t num_rows); /// Flush OLAPStatus flush(); @@ -57,6 +58,9 @@ class MemTable { 
int64_t flush_size() const { return _flush_size; } + //flush for vectorized + OLAPStatus vflush(); + OLAPStatus vclose(); private: class RowCursorComparator : public RowComparator { public: @@ -67,23 +71,6 @@ class MemTable { const Schema* _schema; }; - struct RowInBlock{ - vectorized::MutableBlock* _block; - size_t _row_pos; - RowInBlock(int i):_block(0), _row_pos(0){} //this constructor is for SkipList::NewNode(0, ...) - RowInBlock(vectorized::MutableBlock* block, size_t row_pos):_block(block), _row_pos(row_pos){} - RowCursorCell cell(int cid){ - StringRef ref = _block->mutable_columns()[cid]->get_data_at(_row_pos); - return RowCursorCell(ref.data); - } - }; - class VecRowComparator { - public: - VecRowComparator(const Schema* schema):_schema(schema){}; - int operator()(const RowInBlock left, const RowInBlock right) const; - private: - const Schema* _schema; - }; private: typedef SkipList Table; @@ -109,6 +96,7 @@ class MemTable { Table::Iterator _it; }; + private: void _tuple_to_row(const Tuple* tuple, ContiguousRow* row, MemPool* mem_pool); void _aggregate_two_row(const ContiguousRow& new_row, TableKey row_in_skiplist); @@ -158,11 +146,37 @@ class MemTable { int64_t _rows = 0; //for vectorized - vectorized::MutableBlock _mutableBlock; - + vectorized::MutableBlock _input_mutable_block; + vectorized::MutableBlock _output_mutable_block; + vectorized::Block to_block(); }; // class MemTable + +struct RowInBlock{ + vectorized::MutableBlock* _block; + size_t _row_pos; + RowInBlock(int i):_block(nullptr), _row_pos(0){} //this constructor is for SkipList::NewNode(0, ...) 
+ RowInBlock(vectorized::MutableBlock* block, size_t row_pos):_block(block), _row_pos(row_pos){} + RowCursorCell cell(int cid){ + StringRef ref = _block->mutable_columns()[cid]->get_data_at(_row_pos); + return RowCursorCell(ref.data); + } +}; +class VecRowComparator { +public: + VecRowComparator(const Schema* schema):_schema(schema){}; + int operator()(const RowInBlock left, const RowInBlock right) const; +private: + const Schema* _schema; +}; + +template <> +inline bool SkipList::Iterator::Valid() const { + return node_->key._block != nullptr; +} + + inline std::ostream& operator<<(std::ostream& os, const MemTable& table) { os << "MemTable(addr=" << &table << ", tablet=" << table.tablet_id() << ", mem=" << table.memory_usage(); diff --git a/be/src/olap/rowset/rowset_writer.h b/be/src/olap/rowset/rowset_writer.h index a36f0f8ceb5dce..fcbbdd0e2d18a6 100644 --- a/be/src/olap/rowset/rowset_writer.h +++ b/be/src/olap/rowset/rowset_writer.h @@ -57,7 +57,9 @@ class RowsetWriter { virtual OLAPStatus flush_single_memtable(MemTable* memtable, int64_t* flush_size) { return OLAP_ERR_FUNC_NOT_IMPLEMENTED; } - + + virtual OLAPStatus add_block(const vectorized::Block* block) { + return OLAP_ERR_FUNC_NOT_IMPLEMENTED; } // finish building and return pointer to the built rowset (guaranteed to be inited). 
// return nullptr when failed virtual RowsetSharedPtr build() = 0; From e1bfc69dce52e3b68bece0ac468da013d7bb6394 Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Thu, 10 Mar 2022 19:40:33 +0800 Subject: [PATCH 09/32] add mem_tracker for vec memtable --- be/src/olap/memtable.cpp | 21 +++++++++++++++------ be/src/olap/memtable.h | 8 +++++--- be/src/vec/core/block.cpp | 9 +++++++++ be/src/vec/core/block.h | 1 + 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 539248fdb5c78e..7f89b4a60aa5f6 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -28,10 +28,10 @@ #include "util/doris_metrics.h" namespace doris { - MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet_schema, const std::vector* slot_descs, TupleDescriptor* tuple_desc, - KeysType keys_type, RowsetWriter* rowset_writer) + KeysType keys_type, RowsetWriter* rowset_writer, + bool support_vec) : _tablet_id(tablet_id), _schema(schema), _tablet_schema(tablet_schema), @@ -41,8 +41,9 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet _buffer_mem_pool(new MemPool(_mem_tracker.get())), _table_mem_pool(new MemPool(_mem_tracker.get())), _schema_size(_schema->schema_size()), - _rowset_writer(rowset_writer) { - if (config::enable_storage_vectorization){ + _rowset_writer(rowset_writer), + _is_first_insertion(true) { + if (support_vec){ _vec_row_comparator = std::make_shared(_schema); _vec_skip_list = new VecTable(_vec_row_comparator.get(), _table_mem_pool.get(), _keys_type == KeysType::DUP_KEYS); @@ -79,14 +80,19 @@ int VecRowComparator::operator()(const RowInBlock left, const RowInBlock right) void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num_rows) { - if (_input_mutable_block.columns() == 0) + if (_is_first_insertion) { + _is_first_insertion = false; auto cloneBlock = block->clone_without_columns(); _input_mutable_block = 
vectorized::MutableBlock::build_mutable_block(&cloneBlock); _output_mutable_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); } size_t cursor_in_mutableblock = _input_mutable_block.rows(); + size_t oldsize = _input_mutable_block.allocated_bytes(); /* measure the memtable's own buffer: the input block is const here, so measuring it would always yield a delta of 0 */ _input_mutable_block.add_rows(block, row_pos, row_pos + num_rows); + size_t newsize = _input_mutable_block.allocated_bytes(); + _mem_tracker->Consume(newsize - oldsize); + for(int i = 0; i < num_rows; i++){ insert_one_row_from_block(RowInBlock(&_input_mutable_block, cursor_in_mutableblock + i)); } @@ -204,16 +210,19 @@ vectorized::Block MemTable::to_block() OLAPStatus MemTable::vflush(){ VLOG_CRITICAL << "begin to flush memtable for tablet: " << _tablet_id << ", memsize: " << memory_usage() << ", rows: " << _rows; + _flush_size = 0; /* assign the member returned by flush_size(); declaring a local of the same name here would shadow it */ int64_t duration_ns = 0; { SCOPED_RAW_TIMER(&duration_ns); vectorized::Block block = to_block(); OLAPStatus st = _rowset_writer->add_block(&block); RETURN_NOT_OK(st); + _flush_size = block.allocated_bytes(); } DorisMetrics::instance()->memtable_flush_total->increment(1); DorisMetrics::instance()->memtable_flush_duration_us->increment(duration_ns / 1000); - + VLOG_CRITICAL << "after flush memtable for tablet: " << _tablet_id + << ", flushsize: " << _flush_size; return OLAP_SUCCESS; } OLAPStatus MemTable::vclose() { diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 2af08a667d15f5..8ec011d23273cd 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -40,14 +40,16 @@ class RowInBlock; class VecRowComparator; class MemTable { public: + MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet_schema, const std::vector* slot_descs, TupleDescriptor* tuple_desc, - KeysType keys_type, RowsetWriter* rowset_writer); + KeysType keys_type, RowsetWriter* rowset_writer, + bool support_vec=false); ~MemTable(); int64_t tablet_id() const { return _tablet_id; } size_t memory_usage() const { return _mem_tracker->consumption(); } - std::shared_ptr mem_tracker() { return
_mem_tracker; } + void insert(const Tuple* tuple); //insert tuple from (row_pos) to (row_pos+num_rows) void insert(const vectorized::Block* block, size_t row_pos, size_t num_rows); @@ -149,7 +151,7 @@ class MemTable { vectorized::MutableBlock _input_mutable_block; vectorized::MutableBlock _output_mutable_block; vectorized::Block to_block(); - + bool _is_first_insertion; }; // class MemTable diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index 61814f81050433..1a670bd819f2ed 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -952,4 +952,13 @@ void Block::shrink_char_type_column_suffix_zero(std::vector char_type_id } } } +size_t MutableBlock::allocated_bytes() const { + size_t res = 0; + for (const auto& col : _columns) { + res += col->allocated_bytes(); + } + + return res; +} + } // namespace doris::vectorized diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h index fdbf63db215cc1..dca1a73fbc669c 100644 --- a/be/src/vec/core/block.h +++ b/be/src/vec/core/block.h @@ -423,6 +423,7 @@ class MutableBlock { _columns.clear(); _data_types.clear(); } + size_t allocated_bytes() const; }; } // namespace vectorized From 4bd19edeb61cf00368d8dccc370740557c7b9450 Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Fri, 11 Mar 2022 11:14:50 +0800 Subject: [PATCH 10/32] remove RowInBlock._block --- be/src/olap/memtable.cpp | 37 ++++++++++++++++----------- be/src/olap/memtable.h | 55 ++++++++++++++++++++-------------------- 2 files changed, 49 insertions(+), 43 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 7f89b4a60aa5f6..2a96ad8d66a325 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -44,7 +44,7 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet _rowset_writer(rowset_writer), _is_first_insertion(true) { if (support_vec){ - _vec_row_comparator = std::make_shared(_schema); + _vec_row_comparator = std::make_shared(_schema); _vec_skip_list
= new VecTable(_vec_row_comparator.get(), _table_mem_pool.get(), _keys_type == KeysType::DUP_KEYS); }else{ @@ -71,11 +71,10 @@ int MemTable::RowCursorComparator::operator()(const char* left, const char* righ return compare_row(lhs_row, rhs_row); } -int VecRowComparator::operator()(const RowInBlock left, const RowInBlock right) const{ - return left._block->compare_at(left._row_pos, right._row_pos, +int MemTable::RowInBlockComparator::operator()(const RowInBlock left, const RowInBlock right) const{ + return _pblock->compare_at(left._row_pos, right._row_pos, _schema->num_key_columns(), - *(right._block), -1); - //nan_direction_hint == -1, NaN and NULLs are considered as least than everything other; + *_pblock, -1); } void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num_rows) @@ -94,7 +93,7 @@ void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num _mem_tracker->Consume(newsize - oldsize); for(int i = 0; i < num_rows; i++){ - insert_one_row_from_block(RowInBlock(&_input_mutable_block, cursor_in_mutableblock + i)); + insert_one_row_from_block(RowInBlock(cursor_in_mutableblock + i)); } } @@ -178,26 +177,25 @@ void MemTable::_aggregate_two_row(const ContiguousRow& src_row, TableKey row_in_ void MemTable::_aggregate_two_rowInBlock(RowInBlock new_row, RowInBlock row_in_skiplist){ if (_tablet_schema->has_sequence_col()) { auto sequence_idx = _tablet_schema->sequence_col_idx(); - auto seq_dst_cell = row_in_skiplist.cell(sequence_idx); - auto seq_src_cell = new_row.cell(sequence_idx); + auto seq_dst_cell = row_in_skiplist.cell(&_input_mutable_block, sequence_idx); + auto seq_src_cell = new_row.cell(&_input_mutable_block, sequence_idx); auto res = _schema->column(sequence_idx)->compare_cell(seq_dst_cell, seq_src_cell); // dst sequence column larger than src, don't need to update if (res > 0) { return; } - } - - + } + //dst is non-sequence row, or dst sequence is smaller for (uint32_t cid = _schema->num_key_columns(); cid < 
_schema->num_columns(); ++cid) { - auto dst_cell = row_in_skiplist.cell(cid); - auto src_cell = new_row.cell(cid); + auto dst_cell = row_in_skiplist.cell(&_input_mutable_block, cid); + auto src_cell = new_row.cell(&_input_mutable_block, cid); _schema->column(cid)->agg_update(&dst_cell, &src_cell, _table_mem_pool.get()); } } -vectorized::Block MemTable::to_block() +vectorized::Block MemTable::collect_skiplist_results() { VecTable::Iterator it(_vec_skip_list); vectorized::Block in_block = _input_mutable_block.to_block(); @@ -214,7 +212,7 @@ OLAPStatus MemTable::vflush(){ int64_t duration_ns = 0; { SCOPED_RAW_TIMER(&duration_ns); - vectorized::Block block = to_block(); + vectorized::Block block = collect_skiplist_results(); OLAPStatus st = _rowset_writer->add_block(&block); RETURN_NOT_OK(st); _flush_size = block.allocated_bytes(); @@ -225,6 +223,15 @@ OLAPStatus MemTable::vflush(){ << ", flushsize: " << _flush_size; return OLAP_SUCCESS; } + + +vectorized::Block MemTable::flush_to_block(){ + + return collect_skiplist_results(); + +} + + OLAPStatus MemTable::vclose() { return vflush(); } diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 8ec011d23273cd..f60db0a0a7075a 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -37,7 +37,7 @@ class TabletSchema; class Tuple; class TupleDescriptor; class RowInBlock; -class VecRowComparator; +class RowInBlockComparator; class MemTable { public: @@ -63,6 +63,9 @@ class MemTable { //flush for vectorized OLAPStatus vflush(); OLAPStatus vclose(); + + //for test + vectorized::Block flush_to_block(); private: class RowCursorComparator : public RowComparator { public: @@ -73,12 +76,32 @@ class MemTable { const Schema* _schema; }; + //row pos in _input_mutable_block + struct RowInBlock{ + size_t _row_pos; + RowInBlock(size_t i):_row_pos(i){} + RowCursorCell cell(vectorized::MutableBlock* block, int cid){ + StringRef ref = block->mutable_columns()[cid]->get_data_at(_row_pos); + return 
RowCursorCell(ref.data); + } + }; + class RowInBlockComparator { + public: + RowInBlockComparator(const Schema* schema):_schema(schema){}; + //call set_block before operator(). + //_input_mutable_block is only created on the first block insert, so pblock cannot be obtained in the Comparator's constructor + void set_block(vectorized::MutableBlock* pblock){_pblock = pblock;} + int operator()(const RowInBlock left, const RowInBlock right) const; + private: + const Schema* _schema; + vectorized::MutableBlock* _pblock = nullptr;// corresponds to MemTable::_input_mutable_block; stays null until set_block() is called + }; private: typedef SkipList Table; typedef Table::key_type TableKey; - typedef SkipList VecTable; + typedef SkipList VecTable; public: /// The iterator of memtable, so that the data in this memtable @@ -115,7 +138,7 @@ class MemTable { std::shared_ptr _row_comparator; - std::shared_ptr _vec_row_comparator; + std::shared_ptr _vec_row_comparator; std::shared_ptr _mem_tracker; // This is a buffer, to hold the memory referenced by the rows that have not @@ -150,35 +173,11 @@ class MemTable { //for vectorized vectorized::MutableBlock _input_mutable_block; vectorized::MutableBlock _output_mutable_block; - vectorized::Block to_block(); + vectorized::Block collect_skiplist_results(); bool _is_first_insertion; }; // class MemTable -struct RowInBlock{ - vectorized::MutableBlock* _block; - size_t _row_pos; - RowInBlock(int i):_block(nullptr), _row_pos(0){} //this constructor is for SkipList::NewNode(0, ...)
- RowInBlock(vectorized::MutableBlock* block, size_t row_pos):_block(block), _row_pos(row_pos){} - RowCursorCell cell(int cid){ - StringRef ref = _block->mutable_columns()[cid]->get_data_at(_row_pos); - return RowCursorCell(ref.data); - } -}; -class VecRowComparator { -public: - VecRowComparator(const Schema* schema):_schema(schema){}; - int operator()(const RowInBlock left, const RowInBlock right) const; -private: - const Schema* _schema; -}; - -template <> -inline bool SkipList::Iterator::Valid() const { - return node_->key._block != nullptr; -} - - inline std::ostream& operator<<(std::ostream& os, const MemTable& table) { os << "MemTable(addr=" << &table << ", tablet=" << table.tablet_id() << ", mem=" << table.memory_usage(); From a0fba46df4fb1480cf7c0178312fe42aa83138a7 Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Fri, 11 Mar 2022 14:58:55 +0800 Subject: [PATCH 11/32] Use virtual function to create MemTable --- be/src/olap/delta_writer.h | 2 +- be/src/vec/olap/vdelta_writer.cpp | 6 ++++++ be/src/vec/olap/vdelta_writer.h | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/be/src/olap/delta_writer.h b/be/src/olap/delta_writer.h index 6c25d5fd2e5d25..d2044526d3b59c 100644 --- a/be/src/olap/delta_writer.h +++ b/be/src/olap/delta_writer.h @@ -100,7 +100,7 @@ class DeltaWriter { void _garbage_collection(); - void _reset_mem_table(); + virtual void _reset_mem_table(); protected: bool _is_init = false; diff --git a/be/src/vec/olap/vdelta_writer.cpp b/be/src/vec/olap/vdelta_writer.cpp index b46954391798f0..4262ad55ad7327 100644 --- a/be/src/vec/olap/vdelta_writer.cpp +++ b/be/src/vec/olap/vdelta_writer.cpp @@ -73,6 +73,12 @@ OLAPStatus VDeltaWriter::write_block(const vectorized::Block* block, const std:: return OLAP_SUCCESS; } +void VDeltaWriter::_reset_mem_table() { + _mem_table.reset(new MemTable(_tablet->tablet_id(), _schema.get(), _tablet_schema, _req.slots, + _req.tuple_desc, _tablet->keys_type(), _rowset_writer.get(), + _mem_tracker, true)); +} 
+ } // namespace vectorized } // namespace doris \ No newline at end of file diff --git a/be/src/vec/olap/vdelta_writer.h b/be/src/vec/olap/vdelta_writer.h index 6716c429a0782f..ac2a6c55568f36 100644 --- a/be/src/vec/olap/vdelta_writer.h +++ b/be/src/vec/olap/vdelta_writer.h @@ -32,6 +32,9 @@ class VDeltaWriter : public DeltaWriter { virtual OLAPStatus write_block(const vectorized::Block* block, const std::vector& row_idxs) override; +protected: + virtual void _reset_mem_table() override; + private: VDeltaWriter(WriteRequest* req, const std::shared_ptr& parent, StorageEngine* storage_engine); From e9e1d549696a69c3b872fb1c28b30380eec8adbc Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Fri, 11 Mar 2022 15:45:16 +0800 Subject: [PATCH 12/32] Fix incorrect idx value in "VDeltaWriter::write_block" --- be/src/vec/olap/vdelta_writer.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/be/src/vec/olap/vdelta_writer.cpp b/be/src/vec/olap/vdelta_writer.cpp index 4262ad55ad7327..c4480c9e2577f5 100644 --- a/be/src/vec/olap/vdelta_writer.cpp +++ b/be/src/vec/olap/vdelta_writer.cpp @@ -54,10 +54,9 @@ OLAPStatus VDeltaWriter::write_block(const vectorized::Block* block, const std:: const size_t num_rows = row_idxs.size(); for (; start < num_rows;) { - auto delta = end + 1 - start; - if (end == num_rows - 1 || (row_idxs[end + 1] - row_idxs[start]) != delta) { - size_t count = delta; - _mem_table->insert(block, start, count); + auto count = end + 1 - start; + if (end == num_rows - 1 || (row_idxs[end + 1] - row_idxs[start]) != count) { + _mem_table->insert(block, row_idxs[start], count); start += count; end = start; } else { From ea6b6307dd7c1c35b0f80c78228dbcdf989b06ef Mon Sep 17 00:00:00 2001 From: jacktengg Date: Wed, 9 Mar 2022 20:16:57 +0800 Subject: [PATCH 13/32] add test case: vectorized broker scan node and tablet sink --- be/src/vec/exec/vbroker_scan_node.cpp | 14 +- be/src/vec/exec/vbroker_scan_node.h | 2 +- be/src/vec/exec/vbroker_scanner.cpp | 6 +- 
be/test/vec/exec/CMakeLists.txt | 3 + be/test/vec/exec/vbroker_scan_node_test.cpp | 648 +++++++++++++++ be/test/vec/exec/vbroker_scanner_test.cpp | 464 +++++++++++ be/test/vec/exec/vtablet_sink_test.cpp | 835 ++++++++++++++++++++ 7 files changed, 1958 insertions(+), 14 deletions(-) create mode 100644 be/test/vec/exec/vbroker_scan_node_test.cpp create mode 100644 be/test/vec/exec/vbroker_scanner_test.cpp create mode 100644 be/test/vec/exec/vtablet_sink_test.cpp diff --git a/be/src/vec/exec/vbroker_scan_node.cpp b/be/src/vec/exec/vbroker_scan_node.cpp index 2054d100ef023a..bc20bf05e8c78a 100644 --- a/be/src/vec/exec/vbroker_scan_node.cpp +++ b/be/src/vec/exec/vbroker_scan_node.cpp @@ -117,8 +117,7 @@ Status VBrokerScanNode::close(RuntimeState* state) { } Status VBrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, - const std::vector& vconjunct_ctxs, - ScannerCounter* counter) { + ScannerCounter* counter) { //create scanner object and open std::unique_ptr scanner = create_scanner(scan_range, counter); RETURN_IF_ERROR(scanner->open()); @@ -196,18 +195,12 @@ Status VBrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, } void VBrokerScanNode::scanner_worker(int start_idx, int length) { - // Clone expr context - std::vector vscanner_expr_ctxs; - auto status = VExpr::clone_if_not_exists({*_vconjunct_ctx_ptr}, _runtime_state, &vscanner_expr_ctxs); - if (!status.ok()) { - LOG(WARNING) << "Clone conjuncts failed."; - } - + Status status = Status::OK(); ScannerCounter counter; for (int i = 0; i < length && status.ok(); ++i) { const TBrokerScanRange& scan_range = _scan_ranges[start_idx + i].scan_range.broker_scan_range; - status = scanner_scan(scan_range, vscanner_expr_ctxs, &counter); + status = scanner_scan(scan_range, &counter); if (!status.ok()) { LOG(WARNING) << "Scanner[" << start_idx + i << "] process failed. 
status=" << status.get_error_msg(); @@ -232,7 +225,6 @@ void VBrokerScanNode::scanner_worker(int start_idx, int length) { if (!status.ok()) { _queue_writer_cond.notify_all(); } - VExpr::close(vscanner_expr_ctxs, _runtime_state); } } \ No newline at end of file diff --git a/be/src/vec/exec/vbroker_scan_node.h b/be/src/vec/exec/vbroker_scan_node.h index 384f792ddb11eb..95aa58f822a187 100644 --- a/be/src/vec/exec/vbroker_scan_node.h +++ b/be/src/vec/exec/vbroker_scan_node.h @@ -45,7 +45,7 @@ class VBrokerScanNode : public BrokerScanNode { void scanner_worker(int start_idx, int length); // Scan one range Status scanner_scan(const TBrokerScanRange& scan_range, - const std::vector& vconjunct_ctxs, ScannerCounter* counter); + ScannerCounter* counter); std::deque> _block_queue; }; diff --git a/be/src/vec/exec/vbroker_scanner.cpp b/be/src/vec/exec/vbroker_scanner.cpp index 7c7990990f9611..c52da490f10cce 100644 --- a/be/src/vec/exec/vbroker_scanner.cpp +++ b/be/src/vec/exec/vbroker_scanner.cpp @@ -41,8 +41,11 @@ VBrokerScanner::~VBrokerScanner() { Status VBrokerScanner::get_next(std::vector& columns, bool* eof) { SCOPED_TIMER(_read_timer); + + const int batch_size = _state->batch_size(); + // Get one line - while (!_scanner_eof) { + while (columns[0]->size() < batch_size && !_scanner_eof) { if (_cur_line_reader == nullptr || _cur_line_reader_eof) { RETURN_IF_ERROR(open_next_reader()); // If there isn't any more reader, break this @@ -68,7 +71,6 @@ Status VBrokerScanner::get_next(std::vector& columns, bool* eo if (_success) { free_expr_local_allocations(); } - break; // break always } } if (_scanner_eof) { diff --git a/be/test/vec/exec/CMakeLists.txt b/be/test/vec/exec/CMakeLists.txt index 07bdcbc767d74d..bf0e1ff353f701 100644 --- a/be/test/vec/exec/CMakeLists.txt +++ b/be/test/vec/exec/CMakeLists.txt @@ -19,3 +19,6 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/vec/exec") ADD_BE_TEST(vgeneric_iterators_test) +ADD_BE_TEST(vbroker_scanner_test) 
+ADD_BE_TEST(vbroker_scan_node_test) +ADD_BE_TEST(vtablet_sink_test) diff --git a/be/test/vec/exec/vbroker_scan_node_test.cpp b/be/test/vec/exec/vbroker_scan_node_test.cpp new file mode 100644 index 00000000000000..0846a26f096015 --- /dev/null +++ b/be/test/vec/exec/vbroker_scan_node_test.cpp @@ -0,0 +1,648 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "vec/exec/vbroker_scan_node.h" + +#include + +#include +#include +#include + +#include "common/object_pool.h" +#include "exprs/binary_predicate.h" +#include "runtime/primitive_type.h" +#include "exprs/slot_ref.h" +#include "exprs/literal.h" +#include "runtime/mem_tracker.h" +#include "exec/local_file_reader.h" +#include "exprs/cast_functions.h" +#include "gen_cpp/Descriptors_types.h" +#include "gen_cpp/PlanNodes_types.h" +#include "runtime/descriptors.h" +#include "runtime/runtime_state.h" +#include "runtime/user_function_cache.h" + +namespace doris { + +Expr* create_literal(ObjectPool* pool, PrimitiveType type, const void* data); + +namespace vectorized { +class VBrokerScanNodeTest : public testing::Test { +public: + VBrokerScanNodeTest() : _runtime_state(TQueryGlobals()) { + init(); + _runtime_state._instance_mem_tracker.reset(new MemTracker()); + } + void init(); + static void SetUpTestCase() { + UserFunctionCache::instance()->init( + "./be/test/runtime/test_data/user_function_cache/normal"); + CastFunctions::init(); + } + +protected: + virtual void SetUp() {} + virtual void TearDown() {} + +private: + void init_desc_table(); + RuntimeState _runtime_state; + ObjectPool _obj_pool; + std::map _slots_map; + TBrokerScanRangeParams _params; + DescriptorTbl* _desc_tbl; + TPlanNode _tnode; +}; + +void VBrokerScanNodeTest::init_desc_table() { + TDescriptorTable t_desc_table; + + // table descriptors + TTableDescriptor t_table_desc; + + t_table_desc.id = 0; + t_table_desc.tableType = TTableType::OLAP_TABLE; + t_table_desc.numCols = 0; + t_table_desc.numClusteringCols = 0; + t_desc_table.tableDescriptors.push_back(t_table_desc); + t_desc_table.__isset.tableDescriptors = true; + + int next_slot_id = 1; + // TSlotDescriptor + // int offset = 1; + // int i = 0; + // k1 + { + TSlotDescriptor slot_desc; + + slot_desc.id = next_slot_id++; + slot_desc.parent = 0; + TTypeDesc type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType 
scalar_type; + scalar_type.__set_type(TPrimitiveType::INT); + node.__set_scalar_type(scalar_type); + type.types.push_back(node); + } + slot_desc.slotType = type; + slot_desc.columnPos = 0; + slot_desc.byteOffset = 0; + slot_desc.nullIndicatorByte = 0; + slot_desc.nullIndicatorBit = -1; + slot_desc.colName = "k1"; + slot_desc.slotIdx = 1; + slot_desc.isMaterialized = true; + + t_desc_table.slotDescriptors.push_back(slot_desc); + } + // k2 + { + TSlotDescriptor slot_desc; + + slot_desc.id = next_slot_id++; + slot_desc.parent = 0; + TTypeDesc type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::INT); + node.__set_scalar_type(scalar_type); + type.types.push_back(node); + } + slot_desc.slotType = type; + slot_desc.columnPos = 1; + slot_desc.byteOffset = 4; + slot_desc.nullIndicatorByte = 0; + slot_desc.nullIndicatorBit = -1; + slot_desc.colName = "k2"; + slot_desc.slotIdx = 2; + slot_desc.isMaterialized = true; + + t_desc_table.slotDescriptors.push_back(slot_desc); + } + // k3 + { + TSlotDescriptor slot_desc; + + slot_desc.id = next_slot_id++; + slot_desc.parent = 0; + TTypeDesc type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::INT); + node.__set_scalar_type(scalar_type); + type.types.push_back(node); + } + slot_desc.slotType = type; + slot_desc.columnPos = 1; + slot_desc.byteOffset = 8; + slot_desc.nullIndicatorByte = 0; + slot_desc.nullIndicatorBit = -1; + slot_desc.colName = "k3"; + slot_desc.slotIdx = 3; + slot_desc.isMaterialized = true; + + t_desc_table.slotDescriptors.push_back(slot_desc); + } + // k4(partitioned column) + { + TSlotDescriptor slot_desc; + + slot_desc.id = next_slot_id++; + slot_desc.parent = 0; + TTypeDesc type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::INT); + 
node.__set_scalar_type(scalar_type); + type.types.push_back(node); + } + slot_desc.slotType = type; + slot_desc.columnPos = 1; + slot_desc.byteOffset = 12; + slot_desc.nullIndicatorByte = 0; + slot_desc.nullIndicatorBit = -1; + slot_desc.colName = "k4"; + slot_desc.slotIdx = 4; + slot_desc.isMaterialized = true; + + t_desc_table.slotDescriptors.push_back(slot_desc); + } + + t_desc_table.__isset.slotDescriptors = true; + { + // TTupleDescriptor dest + TTupleDescriptor t_tuple_desc; + t_tuple_desc.id = 0; + t_tuple_desc.byteSize = 16; + t_tuple_desc.numNullBytes = 0; + t_tuple_desc.tableId = 0; + t_tuple_desc.__isset.tableId = true; + t_desc_table.tupleDescriptors.push_back(t_tuple_desc); + } + + // source tuple descriptor + // TSlotDescriptor + // int offset = 1; + // int i = 0; + // k1 + { + TSlotDescriptor slot_desc; + + slot_desc.id = next_slot_id++; + slot_desc.parent = 1; + TTypeDesc type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::VARCHAR); + scalar_type.__set_len(65535); + node.__set_scalar_type(scalar_type); + type.types.push_back(node); + } + slot_desc.slotType = type; + slot_desc.columnPos = 0; + slot_desc.byteOffset = 0; + slot_desc.nullIndicatorByte = 0; + slot_desc.nullIndicatorBit = -1; + slot_desc.colName = "k1"; + slot_desc.slotIdx = 1; + slot_desc.isMaterialized = true; + + t_desc_table.slotDescriptors.push_back(slot_desc); + } + // k2 + { + TSlotDescriptor slot_desc; + + slot_desc.id = next_slot_id++; + slot_desc.parent = 1; + TTypeDesc type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::VARCHAR); + scalar_type.__set_len(65535); + node.__set_scalar_type(scalar_type); + type.types.push_back(node); + } + slot_desc.slotType = type; + slot_desc.columnPos = 1; + slot_desc.byteOffset = 16; + slot_desc.nullIndicatorByte = 0; + slot_desc.nullIndicatorBit = -1; + slot_desc.colName = 
"k2"; + slot_desc.slotIdx = 2; + slot_desc.isMaterialized = true; + + t_desc_table.slotDescriptors.push_back(slot_desc); + } + // k3 + { + TSlotDescriptor slot_desc; + + slot_desc.id = next_slot_id++; + slot_desc.parent = 1; + TTypeDesc type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::VARCHAR); + scalar_type.__set_len(65535); + node.__set_scalar_type(scalar_type); + type.types.push_back(node); + } + slot_desc.slotType = type; + slot_desc.columnPos = 1; + slot_desc.byteOffset = 32; + slot_desc.nullIndicatorByte = 0; + slot_desc.nullIndicatorBit = -1; + slot_desc.colName = "k3"; + slot_desc.slotIdx = 3; + slot_desc.isMaterialized = true; + + t_desc_table.slotDescriptors.push_back(slot_desc); + } + // k4(partitioned column) + { + TSlotDescriptor slot_desc; + + slot_desc.id = next_slot_id++; + slot_desc.parent = 1; + TTypeDesc type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::VARCHAR); + scalar_type.__set_len(65535); + node.__set_scalar_type(scalar_type); + type.types.push_back(node); + } + slot_desc.slotType = type; + slot_desc.columnPos = 1; + slot_desc.byteOffset = 48; + slot_desc.nullIndicatorByte = 0; + slot_desc.nullIndicatorBit = -1; + slot_desc.colName = "k4"; + slot_desc.slotIdx = 4; + slot_desc.isMaterialized = true; + + t_desc_table.slotDescriptors.push_back(slot_desc); + } + + { + // TTupleDescriptor source + TTupleDescriptor t_tuple_desc; + t_tuple_desc.id = 1; + t_tuple_desc.byteSize = 64; + t_tuple_desc.numNullBytes = 0; + t_tuple_desc.tableId = 0; + t_tuple_desc.__isset.tableId = true; + t_desc_table.tupleDescriptors.push_back(t_tuple_desc); + } + + DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); + + _runtime_state.set_desc_tbl(_desc_tbl); +} + +void VBrokerScanNodeTest::init() { + _params.column_separator = ','; + _params.line_delimiter = '\n'; + + TTypeDesc 
int_type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::INT); + node.__set_scalar_type(scalar_type); + int_type.types.push_back(node); + } + TTypeDesc varchar_type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::VARCHAR); + scalar_type.__set_len(5000); + node.__set_scalar_type(scalar_type); + varchar_type.types.push_back(node); + } + + for (int i = 0; i < 4; ++i) { + TExprNode cast_expr; + cast_expr.node_type = TExprNodeType::CAST_EXPR; + cast_expr.type = int_type; + cast_expr.__set_opcode(TExprOpcode::CAST); + cast_expr.__set_num_children(1); + cast_expr.__set_output_scale(-1); + cast_expr.__isset.fn = true; + cast_expr.fn.name.function_name = "casttoint"; + cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; + cast_expr.fn.arg_types.push_back(varchar_type); + cast_expr.fn.ret_type = int_type; + cast_expr.fn.has_var_args = false; + cast_expr.fn.__set_signature("casttoint(VARCHAR(*))"); + cast_expr.fn.__isset.scalar_fn = true; + cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_int_val"; + + TExprNode slot_ref; + slot_ref.node_type = TExprNodeType::SLOT_REF; + slot_ref.type = varchar_type; + slot_ref.num_children = 0; + slot_ref.__isset.slot_ref = true; + slot_ref.slot_ref.slot_id = 5 + i; + slot_ref.slot_ref.tuple_id = 1; + + TExpr expr; + expr.nodes.push_back(cast_expr); + expr.nodes.push_back(slot_ref); + + _params.expr_of_dest_slot.emplace(i + 1, expr); + _params.src_slot_ids.push_back(5 + i); + } + // _params.__isset.expr_of_dest_slot = true; + _params.__set_dest_tuple_id(0); + _params.__set_src_tuple_id(1); + + init_desc_table(); + + // Node Id + _tnode.node_id = 0; + _tnode.node_type = TPlanNodeType::BROKER_SCAN_NODE; + _tnode.num_children = 0; + _tnode.limit = -1; + _tnode.row_tuples.push_back(0); + _tnode.nullable_tuples.push_back(false); + 
_tnode.broker_scan_node.tuple_id = 0; + _tnode.__isset.broker_scan_node = true; +} + +TEST_F(VBrokerScanNodeTest, normal) { + VBrokerScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); + scan_node.init(_tnode); + auto status = scan_node.prepare(&_runtime_state); + ASSERT_TRUE(status.ok()); + + // set scan range + std::vector scan_ranges; + + { + TScanRangeParams scan_range_params; + + TBrokerScanRange broker_scan_range; + broker_scan_range.params = _params; + + TBrokerRangeDesc range; + range.path = "./be/test/exec/test_data/broker_scanner/normal.csv"; + range.start_offset = 0; + range.size = -1; + range.file_type = TFileType::FILE_LOCAL; + range.format_type = TFileFormatType::FORMAT_CSV_PLAIN; + range.splittable = true; + std::vector columns_from_path{"1"}; + range.__set_columns_from_path(columns_from_path); + range.__set_num_of_columns_from_file(3); + broker_scan_range.ranges.push_back(range); + + scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); + + scan_ranges.push_back(scan_range_params); + } + { + TScanRangeParams scan_range_params; + + TBrokerScanRange broker_scan_range; + broker_scan_range.params = _params; + + TBrokerRangeDesc range; + range.path = "./be/test/exec/test_data/broker_scanner/normal.csv"; + range.start_offset = 1; + range.size = 7; + range.file_type = TFileType::FILE_LOCAL; + range.format_type = TFileFormatType::FORMAT_CSV_PLAIN; + range.splittable = true; + std::vector columns_from_path{"2"}; + range.__set_columns_from_path(columns_from_path); + range.__set_num_of_columns_from_file(3); + broker_scan_range.ranges.push_back(range); + + scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); + + scan_ranges.push_back(scan_range_params); + } + + scan_node.set_scan_ranges(scan_ranges); + + status = scan_node.open(&_runtime_state); + ASSERT_TRUE(status.ok()); + + doris::vectorized::Block block; + bool eos = false; + status = scan_node.get_next(&_runtime_state, &block, &eos); + ASSERT_EQ(3, block.rows()); + 
ASSERT_EQ(4, block.columns()); + ASSERT_FALSE(eos); + + auto columns = block.get_columns(); + ASSERT_EQ(columns[0]->get_int(0), 1); + ASSERT_EQ(columns[0]->get_int(1), 4); + ASSERT_EQ(columns[0]->get_int(2), 8); + + ASSERT_EQ(columns[1]->get_int(0), 2); + ASSERT_EQ(columns[1]->get_int(1), 5); + ASSERT_EQ(columns[1]->get_int(2), 9); + + ASSERT_EQ(columns[2]->get_int(0), 3); + ASSERT_EQ(columns[2]->get_int(1), 6); + ASSERT_EQ(columns[2]->get_int(2), 10); + + ASSERT_EQ(columns[3]->get_int(0), 1); + ASSERT_EQ(columns[3]->get_int(1), 1); + ASSERT_EQ(columns[3]->get_int(2), 1); + + block.clear(); + status = scan_node.get_next(&_runtime_state, &block, &eos); + ASSERT_EQ(1, block.rows()); + ASSERT_FALSE(eos); + + columns = block.get_columns(); + ASSERT_EQ(columns[0]->get_int(0), 4); + ASSERT_EQ(columns[1]->get_int(0), 5); + ASSERT_EQ(columns[2]->get_int(0), 6); + ASSERT_EQ(columns[3]->get_int(0), 2); + + block.clear(); + status = scan_node.get_next(&_runtime_state, &block, &eos); + ASSERT_EQ(0, block.rows()); + ASSERT_TRUE(eos); + + scan_node.close(&_runtime_state); + { + std::stringstream ss; + scan_node.runtime_profile()->pretty_print(&ss); + LOG(INFO) << ss.str(); + } +} + +TEST_F(VBrokerScanNodeTest, where_binary_pre) { + TPlanNode _tnode_ = _tnode; + + TTypeDesc int_type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::INT); + node.__set_scalar_type(scalar_type); + int_type.types.push_back(node); + } + TExpr expr; + { + TExprNode expr_node; + expr_node.__set_node_type(TExprNodeType::BINARY_PRED); + expr_node.type = gen_type_desc(TPrimitiveType::BOOLEAN); + expr_node.__set_num_children(2); + expr_node.__isset.opcode = true; + expr_node.__set_opcode(TExprOpcode::LT); + expr_node.__isset.vector_opcode = true; + expr_node.__set_vector_opcode(TExprOpcode::LT); + expr_node.__isset.fn = true; + expr_node.fn.name.function_name = "lt"; + expr_node.fn.binary_type = TFunctionBinaryType::BUILTIN; 
+ expr_node.fn.ret_type = int_type; + expr_node.fn.has_var_args = false; + expr.nodes.push_back(expr_node); + + } + { + TExprNode expr_node; + expr_node.__set_node_type(TExprNodeType::SLOT_REF); + expr_node.type = int_type; + expr_node.__set_num_children(0); + expr_node.__isset.slot_ref = true; + TSlotRef slot_ref; + slot_ref.__set_slot_id(1); + slot_ref.__set_tuple_id(0); + expr_node.__set_slot_ref(slot_ref); + expr_node.__isset.output_column = true; + expr_node.__set_output_column(0); + expr.nodes.push_back(expr_node); + } + { + TExprNode expr_node; + expr_node.__set_node_type(TExprNodeType::INT_LITERAL); + expr_node.type = int_type; + expr_node.__set_num_children(0); + expr_node.__isset.int_literal = true; + TIntLiteral int_literal; + int_literal.__set_value(8); + expr_node.__set_int_literal(int_literal); + expr.nodes.push_back(expr_node); + } + _tnode_.__set_vconjunct(expr); + + VBrokerScanNode scan_node(&_obj_pool, _tnode_, *_desc_tbl); + auto status = scan_node.init(_tnode_); + ASSERT_TRUE(status.ok()); + status = scan_node.prepare(&_runtime_state); + ASSERT_TRUE(status.ok()); + + // set scan range + std::vector scan_ranges; + + { + TScanRangeParams scan_range_params; + + TBrokerScanRange broker_scan_range; + broker_scan_range.params = _params; + + TBrokerRangeDesc range; + range.path = "./be/test/exec/test_data/broker_scanner/normal.csv"; + range.start_offset = 0; + range.size = -1; + range.file_type = TFileType::FILE_LOCAL; + range.format_type = TFileFormatType::FORMAT_CSV_PLAIN; + range.splittable = true; + std::vector columns_from_path{"1"}; + range.__set_columns_from_path(columns_from_path); + range.__set_num_of_columns_from_file(3); + broker_scan_range.ranges.push_back(range); + + scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); + + scan_ranges.push_back(scan_range_params); + } + + scan_node.set_scan_ranges(scan_ranges); + + status = scan_node.open(&_runtime_state); + ASSERT_TRUE(status.ok()); + + doris::vectorized::Block block; 
+ bool eos = false; + status = scan_node.get_next(&_runtime_state, &block, &eos); + ASSERT_EQ(2, block.rows()); + ASSERT_EQ(4, block.columns()); + + auto columns = block.get_columns(); + ASSERT_EQ(columns[0]->get_int(0), 1); + ASSERT_EQ(columns[0]->get_int(1), 4); + + ASSERT_EQ(columns[1]->get_int(0), 2); + ASSERT_EQ(columns[1]->get_int(1), 5); + + ASSERT_EQ(columns[2]->get_int(0), 3); + ASSERT_EQ(columns[2]->get_int(1), 6); + + ASSERT_EQ(columns[3]->get_int(0), 1); + ASSERT_EQ(columns[3]->get_int(1), 1); + + ASSERT_FALSE(eos); + + block.clear(); + status = scan_node.get_next(&_runtime_state, &block, &eos); + ASSERT_EQ(0, block.rows()); + ASSERT_TRUE(eos); + + scan_node.close(&_runtime_state); + { + std::stringstream ss; + scan_node.runtime_profile()->pretty_print(&ss); + LOG(INFO) << ss.str(); + } +} + + +} // namespace vectorized +} // namespace doris +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/be/test/vec/exec/vbroker_scanner_test.cpp b/be/test/vec/exec/vbroker_scanner_test.cpp new file mode 100644 index 00000000000000..6064abe655a8c2 --- /dev/null +++ b/be/test/vec/exec/vbroker_scanner_test.cpp @@ -0,0 +1,464 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "vec/exec/vbroker_scanner.h"
+
+#include <gtest/gtest.h> // NOTE(review): the four <...> header names were stripped by text extraction and are reconstructed here; confirm against upstream
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "common/object_pool.h"
+#include "runtime/mem_tracker.h"
+#include "exec/local_file_reader.h"
+#include "exprs/cast_functions.h"
+#include "gen_cpp/Descriptors_types.h"
+#include "gen_cpp/PlanNodes_types.h"
+#include "runtime/descriptors.h"
+#include "runtime/runtime_state.h"
+#include "runtime/user_function_cache.h"
+
+namespace doris {
+
+namespace vectorized {
+class VBrokerScannerTest : public testing::Test { // Fixture: descriptor table + scan params shared by the VBrokerScanner tests below
+public:
+    VBrokerScannerTest() : _runtime_state(TQueryGlobals()) {
+        init(); // builds the descriptor table and _params before the profile/tracker are wired up
+        _profile = _runtime_state.runtime_profile();
+        _runtime_state._instance_mem_tracker.reset(new MemTracker());
+    }
+    void init();
+
+    static void SetUpTestCase() {
+        UserFunctionCache::instance()->init(
+                "./be/test/runtime/test_data/user_function_cache/normal");
+        CastFunctions::init();
+    }
+
+protected:
+    void SetUp() override {}
+    void TearDown() override {}
+
+private:
+    void init_desc_table();
+    void init_params();
+
+    TupleId _dst_tuple_id = 0;
+    TupleId _src_tuple_id = 1;
+    RuntimeState _runtime_state;
+    RuntimeProfile* _profile = nullptr;
+    ObjectPool _obj_pool;
+    TBrokerScanRangeParams _params;
+    DescriptorTbl* _desc_tbl = nullptr; // stays null if DescriptorTbl::create fails, instead of being indeterminate
+    std::vector<TNetworkAddress> _addresses; // NOTE(review): element type reconstructed (stripped by extraction); confirm against VBrokerScanner's constructor
+    ScannerCounter _counter;
+    std::vector<TExpr> _pre_filter; // NOTE(review): element type reconstructed (stripped by extraction); confirm against VBrokerScanner's constructor
+};
+
+void VBrokerScannerTest::init_desc_table() { // dest tuple 0: INT k1..k3 (slot ids 1-3); source tuple 1: VARCHAR k1..k3 (slot ids 4-6)
+    TDescriptorTable t_desc_table;
+
+    // table descriptors
+    TTableDescriptor t_table_desc;
+
+    t_table_desc.id = 0;
+    t_table_desc.tableType = TTableType::OLAP_TABLE;
+    t_table_desc.numCols = 0;
+    t_table_desc.numClusteringCols = 0;
+    t_desc_table.tableDescriptors.push_back(t_table_desc);
+    t_desc_table.__isset.tableDescriptors = true;
+
+    int next_slot_id = 1;
+    // TSlotDescriptor
+    // int offset = 1;
+    // int i = 0;
+    // k1
+    {
+        TSlotDescriptor slot_desc;
+
+        slot_desc.id = next_slot_id++;
+        slot_desc.parent = 0;
+        TTypeDesc type;
+        {
+            TTypeNode node;
+            node.__set_type(TTypeNodeType::SCALAR);
+            TScalarType scalar_type;
+            scalar_type.__set_type(TPrimitiveType::INT);
+            node.__set_scalar_type(scalar_type);
+            type.types.push_back(node);
+        }
+        slot_desc.slotType = type;
+        slot_desc.columnPos = 0;
+        slot_desc.byteOffset = 0;
+        slot_desc.nullIndicatorByte = 0;
+        slot_desc.nullIndicatorBit = -1;
+        slot_desc.colName = "k1";
+        slot_desc.slotIdx = 1;
+        slot_desc.isMaterialized = true;
+
+        t_desc_table.slotDescriptors.push_back(slot_desc);
+    }
+    // k2
+    {
+        TSlotDescriptor slot_desc;
+
+        slot_desc.id = next_slot_id++;
+        slot_desc.parent = 0;
+        TTypeDesc type;
+        {
+            TTypeNode node;
+            node.__set_type(TTypeNodeType::SCALAR);
+            TScalarType scalar_type;
+            scalar_type.__set_type(TPrimitiveType::INT);
+            node.__set_scalar_type(scalar_type);
+            type.types.push_back(node);
+        }
+        slot_desc.slotType = type;
+        slot_desc.columnPos = 1;
+        slot_desc.byteOffset = 4;
+        slot_desc.nullIndicatorByte = 0;
+        slot_desc.nullIndicatorBit = -1;
+        slot_desc.colName = "k2";
+        slot_desc.slotIdx = 2;
+        slot_desc.isMaterialized = true;
+
+        t_desc_table.slotDescriptors.push_back(slot_desc);
+    }
+    // k3
+    {
+        TSlotDescriptor slot_desc;
+
+        slot_desc.id = next_slot_id++;
+        slot_desc.parent = 0;
+        TTypeDesc type;
+        {
+            TTypeNode node;
+            node.__set_type(TTypeNodeType::SCALAR);
+            TScalarType scalar_type;
+            scalar_type.__set_type(TPrimitiveType::INT);
+            node.__set_scalar_type(scalar_type);
+            type.types.push_back(node);
+        }
+        slot_desc.slotType = type;
+        slot_desc.columnPos = 2;
+        slot_desc.byteOffset = 8;
+        slot_desc.nullIndicatorByte = 0;
+        slot_desc.nullIndicatorBit = -1;
+        slot_desc.colName = "k3";
+        slot_desc.slotIdx = 3;
+        slot_desc.isMaterialized = true;
+
+        t_desc_table.slotDescriptors.push_back(slot_desc);
+    }
+
+    t_desc_table.__isset.slotDescriptors = true;
+    {
+        // TTupleDescriptor dest
+        TTupleDescriptor t_tuple_desc;
+        t_tuple_desc.id = 0;
+        t_tuple_desc.byteSize = 12;
+        t_tuple_desc.numNullBytes = 0;
+        t_tuple_desc.tableId = 0;
+        t_tuple_desc.__isset.tableId = true;
+        t_desc_table.tupleDescriptors.push_back(t_tuple_desc);
+    }
+
+    // source tuple descriptor
+    // TSlotDescriptor
+    // int offset = 1;
+    // int i = 0;
+    // k1
+    {
+        TSlotDescriptor slot_desc;
+
+        slot_desc.id = next_slot_id++;
+        slot_desc.parent = 1;
+        TTypeDesc type;
+        {
+            TTypeNode node;
+            node.__set_type(TTypeNodeType::SCALAR);
+            TScalarType scalar_type;
+            scalar_type.__set_type(TPrimitiveType::VARCHAR);
+            scalar_type.__set_len(65535);
+            node.__set_scalar_type(scalar_type);
+            type.types.push_back(node);
+        }
+        slot_desc.slotType = type;
+        slot_desc.columnPos = 0;
+        slot_desc.byteOffset = 0;
+        slot_desc.nullIndicatorByte = 0;
+        slot_desc.nullIndicatorBit = -1;
+        slot_desc.colName = "k1";
+        slot_desc.slotIdx = 1;
+        slot_desc.isMaterialized = true;
+
+        t_desc_table.slotDescriptors.push_back(slot_desc);
+    }
+    // k2
+    {
+        TSlotDescriptor slot_desc;
+
+        slot_desc.id = next_slot_id++;
+        slot_desc.parent = 1;
+        TTypeDesc type;
+        {
+            TTypeNode node;
+            node.__set_type(TTypeNodeType::SCALAR);
+            TScalarType scalar_type;
+            scalar_type.__set_type(TPrimitiveType::VARCHAR);
+            scalar_type.__set_len(65535);
+            node.__set_scalar_type(scalar_type);
+            type.types.push_back(node);
+        }
+        slot_desc.slotType = type;
+        slot_desc.columnPos = 1;
+        slot_desc.byteOffset = 16;
+        slot_desc.nullIndicatorByte = 0;
+        slot_desc.nullIndicatorBit = -1;
+        slot_desc.colName = "k2";
+        slot_desc.slotIdx = 2;
+        slot_desc.isMaterialized = true;
+
+        t_desc_table.slotDescriptors.push_back(slot_desc);
+    }
+    // k3
+    {
+        TSlotDescriptor slot_desc;
+
+        slot_desc.id = next_slot_id++;
+        slot_desc.parent = 1;
+        TTypeDesc type;
+        {
+            TTypeNode node;
+            node.__set_type(TTypeNodeType::SCALAR);
+            TScalarType scalar_type;
+            scalar_type.__set_type(TPrimitiveType::VARCHAR);
+            scalar_type.__set_len(65535);
+            node.__set_scalar_type(scalar_type);
+            type.types.push_back(node);
+        }
+        slot_desc.slotType = type;
+        slot_desc.columnPos = 2;
+        slot_desc.byteOffset = 32;
+        slot_desc.nullIndicatorByte = 0;
+        slot_desc.nullIndicatorBit = -1;
+        slot_desc.colName = "k3";
+        slot_desc.slotIdx = 3;
+        slot_desc.isMaterialized = true;
+
+        t_desc_table.slotDescriptors.push_back(slot_desc);
+    }
+
+    {
+        // TTupleDescriptor source
+        TTupleDescriptor t_tuple_desc;
+        t_tuple_desc.id = 1;
+        t_tuple_desc.byteSize = 48;
+        t_tuple_desc.numNullBytes = 0;
+        t_tuple_desc.tableId = 0;
+        t_tuple_desc.__isset.tableId = true;
+        t_desc_table.tupleDescriptors.push_back(t_tuple_desc);
+    }
+
+    ASSERT_TRUE(DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl).ok()); // previously the returned Status was discarded
+
+    _runtime_state.set_desc_tbl(_desc_tbl);
+}
+
+void VBrokerScannerTest::init_params() { // ',' / '\n' delimited input; dest slot i+1 = casttoint(source slot 4+i) for i in 0..2
+    _params.column_separator = ',';
+    _params.line_delimiter = '\n';
+
+    TTypeDesc int_type;
+    {
+        TTypeNode node;
+        node.__set_type(TTypeNodeType::SCALAR);
+        TScalarType scalar_type;
+        scalar_type.__set_type(TPrimitiveType::INT);
+        node.__set_scalar_type(scalar_type);
+        int_type.types.push_back(node);
+    }
+    TTypeDesc varchar_type;
+    {
+        TTypeNode node;
+        node.__set_type(TTypeNodeType::SCALAR);
+        TScalarType scalar_type;
+        scalar_type.__set_type(TPrimitiveType::VARCHAR);
+        scalar_type.__set_len(5000);
+        node.__set_scalar_type(scalar_type);
+        varchar_type.types.push_back(node);
+    }
+
+    for (int i = 0; i < 3; ++i) {
+        TExprNode cast_expr;
+        cast_expr.node_type = TExprNodeType::CAST_EXPR;
+        cast_expr.type = int_type;
+        cast_expr.__set_opcode(TExprOpcode::CAST);
+        cast_expr.__set_num_children(1);
+        cast_expr.__set_output_scale(-1);
+        cast_expr.__isset.fn = true;
+        cast_expr.fn.name.function_name = "casttoint";
+        cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN;
+        cast_expr.fn.arg_types.push_back(varchar_type);
+        cast_expr.fn.ret_type = int_type;
+        cast_expr.fn.has_var_args = false;
+        cast_expr.fn.__set_signature("casttoint(VARCHAR(*))");
+        cast_expr.fn.__isset.scalar_fn = true;
+        cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_int_val";
+
+        TExprNode slot_ref;
+        slot_ref.node_type = TExprNodeType::SLOT_REF;
+        slot_ref.type = varchar_type;
+        slot_ref.num_children = 0;
+        slot_ref.__isset.slot_ref = true;
+        slot_ref.slot_ref.slot_id = 4 + i; // source-tuple slot ids start at 4 (see init_desc_table)
+        slot_ref.slot_ref.tuple_id = 1;
+
+        TExpr expr;
+        expr.nodes.push_back(cast_expr);
+        expr.nodes.push_back(slot_ref);
+
+        _params.expr_of_dest_slot.emplace(i + 1, expr);
+        _params.src_slot_ids.push_back(4 + i);
+    }
+    // _params.__isset.expr_of_dest_slot = true;
+    _params.__set_dest_tuple_id(_dst_tuple_id);
+    _params.__set_src_tuple_id(_src_tuple_id);
+}
+
+void VBrokerScannerTest::init() { // descriptor table first, then the scan-range params
+    init_desc_table();
+    init_params();
+}
+
+TEST_F(VBrokerScannerTest, normal) { // one unbounded range over normal.csv; expects rows (1,2,3), (4,5,6), (8,9,10) and eof
+    std::vector<TBrokerRangeDesc> ranges; // NOTE(review): element type reconstructed (stripped by extraction)
+    TBrokerRangeDesc range;
+    range.path = "./be/test/exec/test_data/broker_scanner/normal.csv";
+    range.start_offset = 0;
+    range.size = -1;
+    range.splittable = true;
+    range.file_type = TFileType::FILE_LOCAL;
+    range.format_type = TFileFormatType::FORMAT_CSV_PLAIN;
+    ranges.push_back(range);
+
+    VBrokerScanner scanner(&_runtime_state, _profile, _params, ranges, _addresses, _pre_filter, &_counter);
+    auto st = scanner.open();
+    ASSERT_TRUE(st.ok());
+
+    int slot_count = 3;
+    auto tuple_desc = _desc_tbl->get_tuple_descriptor(_dst_tuple_id);
+    std::vector<vectorized::MutableColumnPtr> columns(slot_count); // NOTE(review): element type reconstructed (stripped by extraction); confirm against VBrokerScanner::get_next
+    for (int i = 0; i < slot_count; i++) {
+        columns[i] = tuple_desc->slots()[i]->get_empty_mutable_column();
+    }
+    bool eof = false;
+    st = scanner.get_next(columns, &eof);
+    ASSERT_TRUE(st.ok());
+    ASSERT_TRUE(eof);
+
+    ASSERT_EQ(columns[0]->get_int(0), 1);
+    ASSERT_EQ(columns[0]->get_int(1), 4);
+    ASSERT_EQ(columns[0]->get_int(2), 8);
+
+    ASSERT_EQ(columns[1]->get_int(0), 2);
+    ASSERT_EQ(columns[1]->get_int(1), 5);
+    ASSERT_EQ(columns[1]->get_int(2), 9);
+
+    ASSERT_EQ(columns[2]->get_int(0), 3);
+    ASSERT_EQ(columns[2]->get_int(1), 6);
+    ASSERT_EQ(columns[2]->get_int(2), 10);
+}
+
+TEST_F(VBrokerScannerTest, normal2) { // two ranges (normal2_1.csv size 7, normal2_2.csv size 4); expects rows (1,2,3) and (3,4,5)
+    std::vector<TBrokerRangeDesc> ranges; // NOTE(review): element type reconstructed (stripped by extraction)
+
+    TBrokerRangeDesc range;
+    range.path = "./be/test/exec/test_data/broker_scanner/normal2_1.csv";
+    range.start_offset = 0;
+    range.size = 7;
+    range.splittable = true;
+    range.file_type = TFileType::FILE_LOCAL;
+    range.format_type = TFileFormatType::FORMAT_CSV_PLAIN;
+    ranges.push_back(range);
+
+    range.path = "./be/test/exec/test_data/broker_scanner/normal2_2.csv";
+    range.start_offset = 0;
+    range.size = 4;
+    ranges.push_back(range);
+
+    VBrokerScanner scanner(&_runtime_state, _profile, _params, ranges, _addresses, _pre_filter, &_counter);
+    auto st = scanner.open();
+    ASSERT_TRUE(st.ok());
+
+    int slot_count = 3;
+    auto tuple_desc = _desc_tbl->get_tuple_descriptor(_dst_tuple_id);
+    std::vector<vectorized::MutableColumnPtr> columns(slot_count); // NOTE(review): element type reconstructed (stripped by extraction)
+    for (int i = 0; i < slot_count; i++) {
+        columns[i] = tuple_desc->slots()[i]->get_empty_mutable_column();
+    }
+
+    bool eof = false;
+    st = scanner.get_next(columns, &eof);
+    ASSERT_TRUE(st.ok());
+    ASSERT_TRUE(eof);
+
+    ASSERT_EQ(columns[0]->get_int(0), 1);
+    ASSERT_EQ(columns[0]->get_int(1), 3);
+
+    ASSERT_EQ(columns[1]->get_int(0), 2);
+    ASSERT_EQ(columns[1]->get_int(1), 4);
+
+    ASSERT_EQ(columns[2]->get_int(0), 3);
+    ASSERT_EQ(columns[2]->get_int(1), 5);
+}
+
+TEST_F(VBrokerScannerTest, normal5) { // zero-sized range over normal.csv; expects eof with no rows produced
+    std::vector<TBrokerRangeDesc> ranges; // NOTE(review): element type reconstructed (stripped by extraction)
+    TBrokerRangeDesc range;
+    range.path = "./be/test/exec/test_data/broker_scanner/normal.csv";
+    range.start_offset = 0;
+    range.size = 0;
+    range.splittable = true;
+    range.file_type = TFileType::FILE_LOCAL;
+    range.format_type = TFileFormatType::FORMAT_CSV_PLAIN;
+    ranges.push_back(range);
+
+    VBrokerScanner scanner(&_runtime_state, _profile, _params, ranges, _addresses, _pre_filter, &_counter);
+    auto st = scanner.open();
+    ASSERT_TRUE(st.ok());
+
+    int slot_count = 3;
+    auto tuple_desc = _desc_tbl->get_tuple_descriptor(_dst_tuple_id);
+    std::vector<vectorized::MutableColumnPtr> columns(slot_count); // NOTE(review): element type reconstructed (stripped by extraction)
+    for (int i = 0; i < slot_count; i++) {
+        columns[i] = tuple_desc->slots()[i]->get_empty_mutable_column();
+    }
+    bool eof = false;
+    // end of file
+    st = scanner.get_next(columns, &eof);
+    ASSERT_TRUE(st.ok());
+    ASSERT_TRUE(eof);
+    ASSERT_EQ(columns[0]->size(), 0);
+}
+} // namespace vectorized
+} // namespace doris
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/be/test/vec/exec/vtablet_sink_test.cpp b/be/test/vec/exec/vtablet_sink_test.cpp
new file mode 100644
index 00000000000000..31d9ed84cbc270
--- /dev/null
+++ b/be/test/vec/exec/vtablet_sink_test.cpp
@@ -0,0 +1,835 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "vec/sink/vtablet_sink.h" + +#include + +#include +#include +#include + +#include "common/config.h" +#include "gen_cpp/HeartbeatService_types.h" +#include "gen_cpp/internal_service.pb.h" +#include "runtime/bufferpool/reservation_tracker.h" +#include "runtime/decimalv2_value.h" +#include "runtime/descriptor_helper.h" +#include "runtime/exec_env.h" +#include "runtime/result_queue_mgr.h" +#include "runtime/runtime_state.h" +#include "runtime/stream_load/load_stream_mgr.h" +#include "runtime/thread_resource_mgr.h" +#include "runtime/types.h" +#include "service/brpc.h" +#include "util/brpc_client_cache.h" +#include "util/cpu_info.h" +#include "util/debug/leakcheck_disabler.h" +#include "util/proto_util.h" + +namespace doris { + +namespace stream_load { + +Status k_add_batch_status; + +class VOlapTableSinkTest : public testing::Test { +public: + VOlapTableSinkTest() {} + virtual ~VOlapTableSinkTest() {} + void SetUp() override { + k_add_batch_status = Status::OK(); + _env = ExecEnv::GetInstance(); + _env->_thread_mgr = new ThreadResourceMgr(); + _env->_master_info = new TMasterInfo(); + _env->_load_stream_mgr = new LoadStreamMgr(); + _env->_internal_client_cache = new BrpcClientCache(); + _env->_function_client_cache = new BrpcClientCache(); + _env->_buffer_reservation = new ReservationTracker(); + ThreadPoolBuilder("SendBatchThreadPool") + .set_min_threads(1) + .set_max_threads(5) + .set_max_queue_size(100) + .build(&_env->_send_batch_thread_pool); + config::tablet_writer_open_rpc_timeout_sec = 60; + config::max_send_batch_parallelism_per_job = 1; + } + + void TearDown() override { + SAFE_DELETE(_env->_internal_client_cache); + SAFE_DELETE(_env->_function_client_cache); + SAFE_DELETE(_env->_load_stream_mgr); + SAFE_DELETE(_env->_master_info); + SAFE_DELETE(_env->_thread_mgr); + SAFE_DELETE(_env->_buffer_reservation); + if (_server) { + _server->Stop(100); + _server->Join(); + SAFE_DELETE(_server); + } + } + +private: + ExecEnv* _env = nullptr; + brpc::Server* 
_server = nullptr; +}; + +TDataSink get_data_sink(TDescriptorTable* desc_tbl) { + int64_t db_id = 1; + int64_t table_id = 2; + int64_t partition_id = 3; + int64_t index1_id = 4; + int64_t tablet1_id = 6; + int64_t tablet2_id = 7; + + TDataSink data_sink; + data_sink.type = TDataSinkType::OLAP_TABLE_SINK; + data_sink.__isset.olap_table_sink = true; + + TOlapTableSink& tsink = data_sink.olap_table_sink; + tsink.load_id.hi = 123; + tsink.load_id.lo = 456; + tsink.txn_id = 789; + tsink.db_id = 1; + tsink.table_id = 2; + tsink.tuple_id = 0; + tsink.num_replicas = 3; + tsink.db_name = "testDb"; + tsink.table_name = "testTable"; + + // construct schema + TOlapTableSchemaParam& tschema = tsink.schema; + tschema.db_id = 1; + tschema.table_id = 2; + tschema.version = 0; + + // descriptor + { + TDescriptorTableBuilder dtb; + { + TTupleDescriptorBuilder tuple_builder; + + tuple_builder.add_slot(TSlotDescriptorBuilder() + .type(TYPE_INT) + .column_name("c1") + .column_pos(1) + .nullable(false) + .build()); + tuple_builder.add_slot(TSlotDescriptorBuilder() + .type(TYPE_BIGINT) + .column_name("c2") + .column_pos(2) + .nullable(false) + .build()); + tuple_builder.add_slot(TSlotDescriptorBuilder() + .string_type(10) + .column_name("c3") + .column_pos(3) + .nullable(false) + .build()); + + tuple_builder.build(&dtb); + } + { + TTupleDescriptorBuilder tuple_builder; + + tuple_builder.add_slot(TSlotDescriptorBuilder() + .type(TYPE_INT) + .column_name("c1") + .column_pos(1) + .build()); + tuple_builder.add_slot(TSlotDescriptorBuilder() + .type(TYPE_BIGINT) + .column_name("c2") + .column_pos(2) + .build()); + tuple_builder.add_slot(TSlotDescriptorBuilder() + .string_type(20) + .column_name("c3") + .column_pos(3) + .build()); + + tuple_builder.build(&dtb); + } + + *desc_tbl = dtb.desc_tbl(); + tschema.slot_descs = desc_tbl->slotDescriptors; + tschema.tuple_desc = desc_tbl->tupleDescriptors[0]; + } + // index + tschema.indexes.resize(1); + tschema.indexes[0].id = index1_id; + 
tschema.indexes[0].columns = {"c1", "c2", "c3"}; + // tschema.indexes[1].id = 5; + // tschema.indexes[1].columns = {"c1", "c3"}; + // partition + TOlapTablePartitionParam& tpartition = tsink.partition; + tpartition.db_id = db_id; + tpartition.table_id = table_id; + tpartition.version = table_id; + tpartition.__set_partition_column("c2"); + tpartition.__set_distributed_columns({"c1", "c3"}); + tpartition.partitions.resize(1); + tpartition.partitions[0].id = partition_id; + tpartition.partitions[0].num_buckets = 2; + tpartition.partitions[0].indexes.resize(1); + tpartition.partitions[0].indexes[0].index_id = index1_id; + tpartition.partitions[0].indexes[0].tablets = {tablet1_id, tablet2_id}; + // location + TOlapTableLocationParam& location = tsink.location; + location.db_id = db_id; + location.table_id = table_id; + location.version = 0; + location.tablets.resize(2); + location.tablets[0].tablet_id = tablet1_id; + location.tablets[0].node_ids = {0, 1, 2}; + location.tablets[1].tablet_id = tablet2_id; + location.tablets[1].node_ids = {0, 1, 2}; + // location + TPaloNodesInfo& nodes_info = tsink.nodes_info; + nodes_info.nodes.resize(3); + nodes_info.nodes[0].id = 0; + nodes_info.nodes[0].host = "127.0.0.1"; + nodes_info.nodes[0].async_internal_port = 4356; + nodes_info.nodes[1].id = 1; + nodes_info.nodes[1].host = "127.0.0.1"; + nodes_info.nodes[1].async_internal_port = 4356; + nodes_info.nodes[2].id = 2; + nodes_info.nodes[2].host = "127.0.0.1"; + nodes_info.nodes[2].async_internal_port = 4357; + + return data_sink; +} + +TDataSink get_decimal_sink(TDescriptorTable* desc_tbl) { + int64_t db_id = 1; + int64_t table_id = 2; + int64_t partition_id = 3; + int64_t index1_id = 4; + int64_t tablet1_id = 6; + int64_t tablet2_id = 7; + + TDataSink data_sink; + data_sink.type = TDataSinkType::OLAP_TABLE_SINK; + data_sink.__isset.olap_table_sink = true; + + TOlapTableSink& tsink = data_sink.olap_table_sink; + tsink.load_id.hi = 123; + tsink.load_id.lo = 456; + tsink.txn_id = 
789; + tsink.db_id = 1; + tsink.table_id = 2; + tsink.tuple_id = 0; + tsink.num_replicas = 3; + tsink.db_name = "testDb"; + tsink.table_name = "testTable"; + + // construct schema + TOlapTableSchemaParam& tschema = tsink.schema; + tschema.db_id = 1; + tschema.table_id = 2; + tschema.version = 0; + + // descriptor + { + TDescriptorTableBuilder dtb; + { + TTupleDescriptorBuilder tuple_builder; + + tuple_builder.add_slot(TSlotDescriptorBuilder() + .type(TYPE_INT) + .column_name("c1") + .column_pos(1) + .nullable(false) + .build()); + tuple_builder.add_slot(TSlotDescriptorBuilder() + .decimal_type(5, 2) + .column_name("c2") + .column_pos(2) + .nullable(false) + .build()); + + tuple_builder.build(&dtb); + } + + *desc_tbl = dtb.desc_tbl(); + tschema.slot_descs = desc_tbl->slotDescriptors; + tschema.tuple_desc = desc_tbl->tupleDescriptors[0]; + } + // index + tschema.indexes.resize(1); + tschema.indexes[0].id = index1_id; + tschema.indexes[0].columns = {"c1", "c2"}; + // tschema.indexes[1].id = 5; + // tschema.indexes[1].columns = {"c1", "c3"}; + // partition + TOlapTablePartitionParam& tpartition = tsink.partition; + tpartition.db_id = db_id; + tpartition.table_id = table_id; + tpartition.version = table_id; + tpartition.__set_partition_column("c1"); + tpartition.__set_distributed_columns({"c2"}); + tpartition.partitions.resize(1); + tpartition.partitions[0].id = partition_id; + tpartition.partitions[0].num_buckets = 2; + tpartition.partitions[0].indexes.resize(1); + tpartition.partitions[0].indexes[0].index_id = index1_id; + tpartition.partitions[0].indexes[0].tablets = {tablet1_id, tablet2_id}; + // location + TOlapTableLocationParam& location = tsink.location; + location.db_id = db_id; + location.table_id = table_id; + location.version = 0; + location.tablets.resize(2); + location.tablets[0].tablet_id = tablet1_id; + location.tablets[0].node_ids = {0, 1, 2}; + location.tablets[1].tablet_id = tablet2_id; + location.tablets[1].node_ids = {0, 1, 2}; + // location + 
TPaloNodesInfo& nodes_info = tsink.nodes_info; + nodes_info.nodes.resize(3); + nodes_info.nodes[0].id = 0; + nodes_info.nodes[0].host = "127.0.0.1"; + nodes_info.nodes[0].async_internal_port = 4356; + nodes_info.nodes[1].id = 1; + nodes_info.nodes[1].host = "127.0.0.1"; + nodes_info.nodes[1].async_internal_port = 4356; + nodes_info.nodes[2].id = 2; + nodes_info.nodes[2].host = "127.0.0.1"; + nodes_info.nodes[2].async_internal_port = 4357; + + return data_sink; +} + +class TestInternalService : public PBackendService { +public: + TestInternalService() {} + virtual ~TestInternalService() {} + + void transmit_data(::google::protobuf::RpcController* controller, + const ::doris::PTransmitDataParams* request, + ::doris::PTransmitDataResult* response, + ::google::protobuf::Closure* done) override { + brpc::ClosureGuard done_guard(done); + } + + void tablet_writer_open(google::protobuf::RpcController* controller, + const PTabletWriterOpenRequest* request, + PTabletWriterOpenResult* response, + google::protobuf::Closure* done) override { + brpc::ClosureGuard done_guard(done); + Status status; + status.to_protobuf(response->mutable_status()); + } + + void tablet_writer_add_block(google::protobuf::RpcController* controller, + const PTabletWriterAddBlockRequest* request, + PTabletWriterAddBlockResult* response, + google::protobuf::Closure* done) { + brpc::ClosureGuard done_guard(done); + { + std::lock_guard l(_lock); + _row_counters += request->tablet_ids_size(); + if (request->eos()) { + _eof_counters++; + } + k_add_batch_status.to_protobuf(response->mutable_status()); + + if (request->has_block() && _row_desc != nullptr) { + brpc::Controller* cntl = static_cast(controller); + attachment_transfer_request_block(request, cntl); + vectorized::Block block(request->block()); + + for (size_t row_num = 0; row_num < block.rows(); ++row_num) { + std::stringstream out; + out << "("; + for (size_t i = 0; i < block.columns(); ++i) { + if (block.get_by_position(i).column) { + out << 
block.get_by_position(i).to_string(row_num); + } + if (i != block.columns() - 1) { + out << ", "; + } + } + out << ")"; + _output_set->emplace(out.str()); + LOG(INFO) << out.str(); + } + } + } + } + void tablet_writer_cancel(google::protobuf::RpcController* controller, + const PTabletWriterCancelRequest* request, + PTabletWriterCancelResult* response, + google::protobuf::Closure* done) override { + brpc::ClosureGuard done_guard(done); + } + + std::mutex _lock; + int64_t _eof_counters = 0; + int64_t _row_counters = 0; + RowDescriptor* _row_desc = nullptr; + std::set* _output_set = nullptr; +}; + +TEST_F(VOlapTableSinkTest, normal) { + // start brpc service first + _server = new brpc::Server(); + auto service = new TestInternalService(); + ASSERT_EQ(_server->AddService(service, brpc::SERVER_OWNS_SERVICE), 0); + brpc::ServerOptions options; + { + debug::ScopedLeakCheckDisabler disable_lsan; + _server->Start(4356, &options); + } + + TUniqueId fragment_id; + TQueryOptions query_options; + query_options.batch_size = 1; + RuntimeState state(fragment_id, query_options, TQueryGlobals(), _env); + state.init_mem_trackers(TUniqueId()); + + ObjectPool obj_pool; + TDescriptorTable tdesc_tbl; + auto t_data_sink = get_data_sink(&tdesc_tbl); + + // crate desc_tabl + DescriptorTbl* desc_tbl = nullptr; + auto st = DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + ASSERT_TRUE(st.ok()); + state._desc_tbl = desc_tbl; + + TupleDescriptor* tuple_desc = desc_tbl->get_tuple_descriptor(0); + LOG(INFO) << "tuple_desc=" << tuple_desc->debug_string(); + + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + service->_row_desc = &row_desc; + std::set output_set; + service->_output_set = &output_set; + + VOlapTableSink sink(&obj_pool, row_desc, {}, &st); + ASSERT_TRUE(st.ok()); + + // init + st = sink.init(t_data_sink); + ASSERT_TRUE(st.ok()); + // prepare + st = sink.prepare(&state); + ASSERT_TRUE(st.ok()); + // open + st = sink.open(&state); + ASSERT_TRUE(st.ok()); + + int slot_count = 
tuple_desc->slots().size(); + std::vector columns(slot_count); + for (int i = 0; i < slot_count; i++) { + columns[i] = tuple_desc->slots()[i]->get_empty_mutable_column(); + } + + int col_idx = 0; + auto* column_ptr = columns[col_idx++].get(); + auto column_vector_int = assert_cast*>(column_ptr); + int int_val = 12; + column_vector_int->insert_data((const char*)&int_val, 0); + int_val = 13; + column_vector_int->insert_data((const char*)&int_val, 0); + int_val = 14; + column_vector_int->insert_data((const char*)&int_val, 0); + + column_ptr = columns[col_idx++].get(); + auto column_vector_bigint = assert_cast*>(column_ptr); + int64_t int64_val = 9; + column_vector_bigint->insert_data((const char*)&int64_val, 0); + int64_val = 25; + column_vector_bigint->insert_data((const char*)&int64_val, 0); + int64_val = 50; + column_vector_bigint->insert_data((const char*)&int64_val, 0); + + column_ptr = columns[col_idx++].get(); + auto column_vector_str = assert_cast(column_ptr); + column_vector_str->insert_data("abc", 3); + column_vector_str->insert_data("abcd", 4); + column_vector_str->insert_data("abcde1234567890", 15); + + vectorized::Block block; + col_idx = 0; + for (const auto slot_desc : tuple_desc->slots()) { + block.insert(vectorized::ColumnWithTypeAndName( + std::move(columns[col_idx++]), + slot_desc->get_data_type_ptr(), + slot_desc->col_name())); + } + + // send + st = sink.send(&state, &block); + ASSERT_TRUE(st.ok()); + // close + st = sink.close(&state, Status::OK()); + ASSERT_TRUE(st.ok() || st.to_string() == "Internal error: wait close failed. 
") + << st.to_string(); + + // each node has a eof + ASSERT_EQ(2, service->_eof_counters); + ASSERT_EQ(2 * 2, service->_row_counters); + + // 2node * 2 + ASSERT_EQ(1, state.num_rows_load_filtered()); +} + +TEST_F(VOlapTableSinkTest, convert) { + // start brpc service first + _server = new brpc::Server(); + auto service = new TestInternalService(); + ASSERT_EQ(_server->AddService(service, brpc::SERVER_OWNS_SERVICE), 0); + brpc::ServerOptions options; + { + debug::ScopedLeakCheckDisabler disable_lsan; + _server->Start(4356, &options); + } + + TUniqueId fragment_id; + TQueryOptions query_options; + query_options.batch_size = 1024; + RuntimeState state(fragment_id, query_options, TQueryGlobals(), _env); + state.init_mem_trackers(TUniqueId()); + + ObjectPool obj_pool; + TDescriptorTable tdesc_tbl; + auto t_data_sink = get_data_sink(&tdesc_tbl); + + // crate desc_tabl + DescriptorTbl* desc_tbl = nullptr; + auto st = DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + ASSERT_TRUE(st.ok()); + state._desc_tbl = desc_tbl; + + TupleDescriptor* tuple_desc = desc_tbl->get_tuple_descriptor(0); + + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + + // expr + std::vector exprs; + exprs.resize(3); + exprs[0].nodes.resize(1); + exprs[0].nodes[0].node_type = TExprNodeType::SLOT_REF; + exprs[0].nodes[0].type = tdesc_tbl.slotDescriptors[3].slotType; + exprs[0].nodes[0].num_children = 0; + exprs[0].nodes[0].__isset.slot_ref = true; + exprs[0].nodes[0].slot_ref.slot_id = 0; + exprs[0].nodes[0].slot_ref.tuple_id = 1; + + exprs[1].nodes.resize(1); + exprs[1].nodes[0].node_type = TExprNodeType::SLOT_REF; + exprs[1].nodes[0].type = tdesc_tbl.slotDescriptors[4].slotType; + exprs[1].nodes[0].num_children = 0; + exprs[1].nodes[0].__isset.slot_ref = true; + exprs[1].nodes[0].slot_ref.slot_id = 1; + exprs[1].nodes[0].slot_ref.tuple_id = 1; + + exprs[2].nodes.resize(1); + exprs[2].nodes[0].node_type = TExprNodeType::SLOT_REF; + exprs[2].nodes[0].type = 
tdesc_tbl.slotDescriptors[5].slotType; + exprs[2].nodes[0].num_children = 0; + exprs[2].nodes[0].__isset.slot_ref = true; + exprs[2].nodes[0].slot_ref.slot_id = 2; + exprs[2].nodes[0].slot_ref.tuple_id = 1; + + VOlapTableSink sink(&obj_pool, row_desc, exprs, &st); + ASSERT_TRUE(st.ok()); + + // set output tuple_id + t_data_sink.olap_table_sink.tuple_id = 1; + // init + st = sink.init(t_data_sink); + ASSERT_TRUE(st.ok()); + // prepare + st = sink.prepare(&state); + ASSERT_TRUE(st.ok()); + // open + st = sink.open(&state); + ASSERT_TRUE(st.ok()); + // send + int slot_count = tuple_desc->slots().size(); + std::vector columns(slot_count); + for (int i = 0; i < slot_count; i++) { + columns[i] = tuple_desc->slots()[i]->get_empty_mutable_column(); + } + + int col_idx = 0; + auto* column_ptr = columns[col_idx++].get(); + auto column_vector_int = assert_cast*>(column_ptr); + int int_val = 12; + column_vector_int->insert_data((const char*)&int_val, 0); + int_val = 13; + column_vector_int->insert_data((const char*)&int_val, 0); + int_val = 14; + column_vector_int->insert_data((const char*)&int_val, 0); + + column_ptr = columns[col_idx++].get(); + auto column_vector_bigint = assert_cast*>(column_ptr); + int64_t int64_val = 9; + column_vector_bigint->insert_data((const char*)&int64_val, 0); + int64_val = 25; + column_vector_bigint->insert_data((const char*)&int64_val, 0); + int64_val = 50; + column_vector_bigint->insert_data((const char*)&int64_val, 0); + + column_ptr = columns[col_idx++].get(); + auto column_vector_str = assert_cast(column_ptr); + column_vector_str->insert_data("abc", 3); + column_vector_str->insert_data("abcd", 4); + column_vector_str->insert_data("abcde", 5); + + vectorized::Block block; + col_idx = 0; + for (const auto slot_desc : tuple_desc->slots()) { + block.insert(vectorized::ColumnWithTypeAndName( + std::move(columns[col_idx++]), + slot_desc->get_data_type_ptr(), + slot_desc->col_name())); + } + st = sink.send(&state, &block); + ASSERT_TRUE(st.ok()); 
+ // close + st = sink.close(&state, Status::OK()); + ASSERT_TRUE(st.ok() || st.to_string() == "Internal error: wait close failed. ") + << st.to_string(); + + // each node has a eof + ASSERT_EQ(2, service->_eof_counters); + ASSERT_EQ(2 * 3, service->_row_counters); + + // 2node * 2 + ASSERT_EQ(0, state.num_rows_load_filtered()); +} + +TEST_F(VOlapTableSinkTest, add_block_failed) { + // start brpc service first + _server = new brpc::Server(); + auto service = new TestInternalService(); + ASSERT_EQ(_server->AddService(service, brpc::SERVER_OWNS_SERVICE), 0); + brpc::ServerOptions options; + { + debug::ScopedLeakCheckDisabler disable_lsan; + _server->Start(4356, &options); + } + + // ObjectPool create before RuntimeState, simulate actual situation better. + ObjectPool obj_pool; + + TUniqueId fragment_id; + TQueryOptions query_options; + query_options.batch_size = 1; + RuntimeState state(fragment_id, query_options, TQueryGlobals(), _env); + state.init_mem_trackers(TUniqueId()); + + TDescriptorTable tdesc_tbl; + auto t_data_sink = get_data_sink(&tdesc_tbl); + + // crate desc_tabl + DescriptorTbl* desc_tbl = nullptr; + auto st = DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + ASSERT_TRUE(st.ok()); + state._desc_tbl = desc_tbl; + + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + + // expr + std::vector exprs; + exprs.resize(3); + exprs[0].nodes.resize(1); + exprs[0].nodes[0].node_type = TExprNodeType::SLOT_REF; + exprs[0].nodes[0].type = tdesc_tbl.slotDescriptors[3].slotType; + exprs[0].nodes[0].num_children = 0; + exprs[0].nodes[0].__isset.slot_ref = true; + exprs[0].nodes[0].slot_ref.slot_id = 0; + exprs[0].nodes[0].slot_ref.tuple_id = 1; + + exprs[1].nodes.resize(1); + exprs[1].nodes[0].node_type = TExprNodeType::SLOT_REF; + exprs[1].nodes[0].type = tdesc_tbl.slotDescriptors[4].slotType; + exprs[1].nodes[0].num_children = 0; + exprs[1].nodes[0].__isset.slot_ref = true; + exprs[1].nodes[0].slot_ref.slot_id = 1; + exprs[1].nodes[0].slot_ref.tuple_id = 1; + + 
exprs[2].nodes.resize(1); + exprs[2].nodes[0].node_type = TExprNodeType::SLOT_REF; + exprs[2].nodes[0].type = tdesc_tbl.slotDescriptors[5].slotType; + exprs[2].nodes[0].num_children = 0; + exprs[2].nodes[0].__isset.slot_ref = true; + exprs[2].nodes[0].slot_ref.slot_id = 2; + exprs[2].nodes[0].slot_ref.tuple_id = 1; + + VOlapTableSink sink(&obj_pool, row_desc, exprs, &st); + ASSERT_TRUE(st.ok()); + + // set output tuple_id + t_data_sink.olap_table_sink.tuple_id = 1; + // init + st = sink.init(t_data_sink); + ASSERT_TRUE(st.ok()); + st = sink.prepare(&state); + ASSERT_TRUE(st.ok()); + st = sink.open(&state); + ASSERT_TRUE(st.ok()); + // send + TupleDescriptor* tuple_desc = desc_tbl->get_tuple_descriptor(0); + + int slot_count = tuple_desc->slots().size(); + std::vector columns(slot_count); + for (int i = 0; i < slot_count; i++) { + columns[i] = tuple_desc->slots()[i]->get_empty_mutable_column(); + } + + int col_idx = 0; + auto* column_ptr = columns[col_idx++].get(); + auto column_vector_int = assert_cast*>(column_ptr); + int int_val = 12; + column_vector_int->insert_data((const char*)&int_val, 0); + + column_ptr = columns[col_idx++].get(); + auto column_vector_bigint = assert_cast*>(column_ptr); + int64_t int64_val = 9; + column_vector_bigint->insert_data((const char*)&int64_val, 0); + + column_ptr = columns[col_idx++].get(); + auto column_vector_str = assert_cast(column_ptr); + column_vector_str->insert_data("abc", 3); + + vectorized::Block block; + col_idx = 0; + for (const auto slot_desc : tuple_desc->slots()) { + block.insert(vectorized::ColumnWithTypeAndName( + std::move(columns[col_idx++]), + slot_desc->get_data_type_ptr(), + slot_desc->col_name())); + } + // Channels will be cancelled internally, coz brpc returns k_add_batch_status. + k_add_batch_status = Status::InternalError("dummy failed"); + st = sink.send(&state, &block); + ASSERT_TRUE(st.ok()); + + // Send batch multiple times, can make _cur_batch or _pending_batches(in channels) not empty. 
+ // To ensure the order of releasing resource is OK. + sink.send(&state, &block); + sink.send(&state, &block); + + // close + st = sink.close(&state, Status::OK()); + ASSERT_FALSE(st.ok()); +} + +TEST_F(VOlapTableSinkTest, decimal) { + // start brpc service first + _server = new brpc::Server(); + auto service = new TestInternalService(); + ASSERT_EQ(_server->AddService(service, brpc::SERVER_OWNS_SERVICE), 0); + brpc::ServerOptions options; + { + debug::ScopedLeakCheckDisabler disable_lsan; + _server->Start(4356, &options); + } + + TUniqueId fragment_id; + TQueryOptions query_options; + query_options.batch_size = 1; + RuntimeState state(fragment_id, query_options, TQueryGlobals(), _env); + state.init_mem_trackers(TUniqueId()); + + ObjectPool obj_pool; + TDescriptorTable tdesc_tbl; + auto t_data_sink = get_decimal_sink(&tdesc_tbl); + + // crate desc_tabl + DescriptorTbl* desc_tbl = nullptr; + auto st = DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + ASSERT_TRUE(st.ok()); + state._desc_tbl = desc_tbl; + + TupleDescriptor* tuple_desc = desc_tbl->get_tuple_descriptor(0); + LOG(INFO) << "tuple_desc=" << tuple_desc->debug_string(); + + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + service->_row_desc = &row_desc; + std::set output_set; + service->_output_set = &output_set; + + VOlapTableSink sink(&obj_pool, row_desc, {}, &st); + ASSERT_TRUE(st.ok()); + + // init + st = sink.init(t_data_sink); + ASSERT_TRUE(st.ok()); + // prepare + st = sink.prepare(&state); + ASSERT_TRUE(st.ok()); + // open + st = sink.open(&state); + ASSERT_TRUE(st.ok()); + // send + int slot_count = tuple_desc->slots().size(); + std::vector columns(slot_count); + for (int i = 0; i < slot_count; i++) { + columns[i] = tuple_desc->slots()[i]->get_empty_mutable_column(); + } + + int col_idx = 0; + auto* column_ptr = columns[col_idx++].get(); + auto column_vector_int = assert_cast*>(column_ptr); + int int_val = 12; + column_vector_int->insert_data((const char*)&int_val, 0); + int_val = 13; + 
column_vector_int->insert_data((const char*)&int_val, 0); + int_val = 14; + column_vector_int->insert_data((const char*)&int_val, 0); + + column_ptr = columns[col_idx++].get(); + auto column_vector_dec = assert_cast*>(column_ptr); + DecimalV2Value dec_val(std::string("12.3")); + column_vector_dec->insert_data((const char*)&dec_val, 0); + dec_val = std::string("123.123456789"); + column_vector_dec->insert_data((const char*)&dec_val, 0); + dec_val = std::string("123456789123.1234"); + column_vector_dec->insert_data((const char*)&dec_val, 0); + + vectorized::Block block; + col_idx = 0; + for (const auto slot_desc : tuple_desc->slots()) { + block.insert(vectorized::ColumnWithTypeAndName( + std::move(columns[col_idx++]), + slot_desc->get_data_type_ptr(), + slot_desc->col_name())); + } + st = sink.send(&state, &block); + ASSERT_TRUE(st.ok()); + // close + st = sink.close(&state, Status::OK()); + ASSERT_TRUE(st.ok() || st.to_string() == "Internal error: wait close failed. ") + << st.to_string(); + + ASSERT_EQ(2, output_set.size()); + ASSERT_TRUE(output_set.count("(12, 12.3)") > 0); + ASSERT_TRUE(output_set.count("(13, 123.12)") > 0); +} +} // namespace stream_load +} // namespace doris + +int main(int argc, char** argv) { + doris::CpuInfo::init(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file From 10f294fb4fb8e19c4eb6835048471b1fb0d791c9 Mon Sep 17 00:00:00 2001 From: jacktengg Date: Fri, 11 Mar 2022 16:01:58 +0800 Subject: [PATCH 14/32] vectorized stream load: fix bugs --- be/src/olap/memtable.cpp | 28 +++++++++++++--------------- be/src/olap/memtable.h | 7 ++----- be/src/vec/core/block.cpp | 4 ++-- be/src/vec/core/block.h | 2 +- 4 files changed, 18 insertions(+), 23 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 2a96ad8d66a325..702ea5ed902897 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -44,10 +44,12 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, 
const TabletSchema* tablet _rowset_writer(rowset_writer), _is_first_insertion(true) { if (support_vec){ + _skip_list = nullptr; _vec_row_comparator = std::make_shared(_schema); _vec_skip_list = new VecTable(_vec_row_comparator.get(), _table_mem_pool.get(), _keys_type == KeysType::DUP_KEYS); }else{ + _vec_skip_list =nullptr; if (tablet_schema->sort_type() == SortType::ZORDER) { _row_comparator = std::make_shared(_schema, tablet_schema->sort_col_num()); @@ -60,7 +62,10 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet } MemTable::~MemTable() { - delete _skip_list; + if (_skip_list) + delete _skip_list; + if (_vec_skip_list) + delete _vec_skip_list; } MemTable::RowCursorComparator::RowCursorComparator(const Schema* schema) : _schema(schema) {} @@ -84,11 +89,12 @@ void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num _is_first_insertion = false; auto cloneBlock = block->clone_without_columns(); _input_mutable_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); + _vec_row_comparator->set_block(&_input_mutable_block); _output_mutable_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); } size_t cursor_in_mutableblock = _input_mutable_block.rows(); size_t oldsize = block->allocated_bytes(); - _input_mutable_block.add_rows(block, row_pos, row_pos + num_rows); + _input_mutable_block.add_rows(block, row_pos, num_rows); size_t newsize = block->allocated_bytes(); _mem_tracker->Consume(newsize - oldsize); @@ -205,7 +211,7 @@ vectorized::Block MemTable::collect_skiplist_results() return _output_mutable_block.to_block(); } -OLAPStatus MemTable::vflush(){ +OLAPStatus MemTable::_vflush(){ VLOG_CRITICAL << "begin to flush memtable for tablet: " << _tablet_id << ", memsize: " << memory_usage() << ", rows: " << _rows; size_t _flush_size = 0; @@ -224,19 +230,11 @@ OLAPStatus MemTable::vflush(){ return OLAP_SUCCESS; } - -vectorized::Block MemTable::flush_to_block(){ - - return 
collect_skiplist_results(); - -} - - -OLAPStatus MemTable::vclose() { - return vflush(); -} - OLAPStatus MemTable::flush() { + if (_vec_skip_list) { + return _vflush(); + } + VLOG_CRITICAL << "begin to flush memtable for tablet: " << _tablet_id << ", memsize: " << memory_usage() << ", rows: " << _rows; int64_t duration_ns = 0; diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index f60db0a0a7075a..2e7f7c95f36e81 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -60,13 +60,10 @@ class MemTable { int64_t flush_size() const { return _flush_size; } +private: //flush for vectorized - OLAPStatus vflush(); - OLAPStatus vclose(); + OLAPStatus _vflush(); - //for test - vectorized::Block flush_to_block(); -private: class RowCursorComparator : public RowComparator { public: RowCursorComparator(const Schema* schema); diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index 1a670bd819f2ed..71228e2caf73e3 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -853,12 +853,12 @@ void MutableBlock::add_rows(const Block* block, const int* row_begin, const int* } } -void MutableBlock::add_rows(const Block* block, size_t row_begin, size_t row_end) { +void MutableBlock::add_rows(const Block* block, size_t row_begin, size_t length) { auto& block_data = block->get_columns_with_type_and_name(); for (size_t i = 0; i < _columns.size(); ++i) { auto& dst = _columns[i]; auto& src = *block_data[i].column.get(); - dst->insert_range_from(src, row_begin, row_end); + dst->insert_range_from(src, row_begin, length); } } diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h index dca1a73fbc669c..d122ca0327cc64 100644 --- a/be/src/vec/core/block.h +++ b/be/src/vec/core/block.h @@ -415,7 +415,7 @@ class MutableBlock { void add_row(const Block* block, int row); void add_rows(const Block* block, const int* row_begin, const int* row_end); - void add_rows(const Block* block, size_t row_begin, size_t row_end); + void add_rows(const 
Block* block, size_t row_begin, size_t length); std::string dump_data(size_t row_limit = 100) const; From 074599c76bfbce33f60be8fb4a7b1e972f997e3f Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Mon, 14 Mar 2022 14:07:50 +0800 Subject: [PATCH 15/32] RowCursorCell support vectorization --- be/src/olap/memtable.h | 4 ++- be/src/olap/row_cursor_cell.h | 62 ++++++++++++++++++++++++++++++----- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 2e7f7c95f36e81..04e3f969843470 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -79,7 +79,9 @@ class MemTable { RowInBlock(size_t i):_row_pos(i){} RowCursorCell cell(vectorized::MutableBlock* block, int cid){ StringRef ref = block->mutable_columns()[cid]->get_data_at(_row_pos); - return RowCursorCell(ref.data); + bool is_null = block->mutable_columns()[cid]->is_null_at(_row_pos); + NullState null_state = is_null ? NullState::IS_NULL : NullState::NOT_NULL; + return RowCursorCell(ref.data, null_state); } }; class RowInBlockComparator { diff --git a/be/src/olap/row_cursor_cell.h b/be/src/olap/row_cursor_cell.h index ffe78b030d5acd..faaa65b33bd595 100644 --- a/be/src/olap/row_cursor_cell.h +++ b/be/src/olap/row_cursor_cell.h @@ -19,18 +19,62 @@ namespace doris { +enum class NullState { + UNKNOWN = 0, + IS_NULL = 1, + NOT_NULL = 2 +}; struct RowCursorCell { - RowCursorCell(void* ptr) : _ptr(ptr) {} - RowCursorCell(const void* ptr) : _ptr((void*)ptr) {} - bool is_null() const { return *reinterpret_cast(_ptr); } - void set_is_null(bool is_null) const { *reinterpret_cast(_ptr) = is_null; } - void set_null() const { *reinterpret_cast(_ptr) = true; } - void set_not_null() const { *reinterpret_cast(_ptr) = false; } - const void* cell_ptr() const { return (char*)_ptr + 1; } - void* mutable_cell_ptr() const { return (char*)_ptr + 1; } - + + RowCursorCell(void* ptr) : _ptr(ptr), _null_state(NullState::UNKNOWN) {} + RowCursorCell(const void* ptr) : 
_ptr((void*)ptr), _null_state(NullState::UNKNOWN) {} + RowCursorCell(void* ptr, NullState null_state) : _ptr((void*)ptr), _null_state(null_state) {} + RowCursorCell(const void* ptr, NullState null_state) : _ptr((void*)ptr), _null_state(null_state) {} + bool is_null() const { + return _null_state == NullState::UNKNOWN ? *reinterpret_cast(_ptr) : _null_state == NullState::IS_NULL; + } + void set_is_null(bool is_null) { + if (_null_state == NullState::UNKNOWN) + *reinterpret_cast(_ptr) = is_null; + else{ + _null_state = (is_null ? NullState::IS_NULL : NullState::NOT_NULL); + } + } + void set_null(){ + if (_null_state == NullState::UNKNOWN){ + *reinterpret_cast(_ptr) = true; + }else{ + _null_state = NullState::IS_NULL; + } + } + void set_not_null(){ + if (_null_state == NullState::UNKNOWN){ + *reinterpret_cast(_ptr) = false; + }else{ + _null_state = NullState::NOT_NULL; + } + } + const void* cell_ptr() const { + if (_null_state == NullState::UNKNOWN){ + return (char*)_ptr + 1; + }else{ + return (char*)_ptr; + } + } + void* mutable_cell_ptr() const { + if (_null_state == NullState::UNKNOWN){ + return (char*)_ptr + 1; + }else{ + return (char*)_ptr; + } + } private: void* _ptr; + /** + * @brief if _null_state is UNKNOWN, the null flag is the first char of ptr + * + */ + NullState _null_state; }; } // namespace doris From 24df7142940bf843df0c6a77df9a2fd103bb45ad Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Tue, 15 Mar 2022 10:57:20 +0800 Subject: [PATCH 16/32] Add test case for VdeltaWriter and VLoadChannelMgr --- be/test/vec/olap/CMakeLists.txt | 22 + be/test/vec/olap/vdelta_writer_test.cpp | 635 +++++++++++++++ be/test/vec/runtime/CMakeLists.txt | 1 + .../vec/runtime/vload_channel_mgr_test.cpp | 763 ++++++++++++++++++ 4 files changed, 1421 insertions(+) create mode 100644 be/test/vec/olap/CMakeLists.txt create mode 100644 be/test/vec/olap/vdelta_writer_test.cpp create mode 100644 be/test/vec/runtime/vload_channel_mgr_test.cpp diff --git a/be/test/vec/olap/CMakeLists.txt 
b/be/test/vec/olap/CMakeLists.txt new file mode 100644 index 00000000000000..ad44f0a5d246c3 --- /dev/null +++ b/be/test/vec/olap/CMakeLists.txt @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# where to put generated libraries +set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/vec/olap") + +ADD_BE_TEST(vdelta_writer_test) + diff --git a/be/test/vec/olap/vdelta_writer_test.cpp b/be/test/vec/olap/vdelta_writer_test.cpp new file mode 100644 index 00000000000000..cbd837ce43f001 --- /dev/null +++ b/be/test/vec/olap/vdelta_writer_test.cpp @@ -0,0 +1,635 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/olap/vdelta_writer.h" + +#include +#include + +#include + +#include "gen_cpp/Descriptors_types.h" +#include "gen_cpp/PaloInternalService_types.h" +#include "gen_cpp/Types_types.h" +#include "olap/field.h" +#include "olap/options.h" +#include "olap/storage_engine.h" +#include "olap/tablet.h" +#include "olap/tablet_meta_manager.h" +#include "olap/utils.h" +#include "runtime/descriptor_helper.h" +#include "runtime/exec_env.h" +#include "runtime/mem_pool.h" +#include "runtime/mem_tracker.h" +#include "runtime/tuple.h" +#include "util/file_utils.h" +#include "util/logging.h" + +namespace doris { + +// This is DeltaWriter unit test which used by streaming load. +// And also it should take schema change into account after streaming load. 
+ +static const uint32_t MAX_PATH_LEN = 1024; + +StorageEngine* k_engine = nullptr; +std::shared_ptr k_mem_tracker = nullptr; + +void set_up() { + char buffer[MAX_PATH_LEN]; + ASSERT_TRUE(getcwd(buffer, MAX_PATH_LEN) != nullptr); + config::storage_root_path = std::string(buffer) + "/data_test"; + FileUtils::remove_all(config::storage_root_path); + FileUtils::create_dir(config::storage_root_path); + std::vector paths; + paths.emplace_back(config::storage_root_path, -1); + + doris::EngineOptions options; + options.store_paths = paths; + Status s = doris::StorageEngine::open(options, &k_engine); + ASSERT_TRUE(s.ok()) << s.to_string(); + + ExecEnv* exec_env = doris::ExecEnv::GetInstance(); + exec_env->set_storage_engine(k_engine); + k_engine->start_bg_threads(); + k_mem_tracker.reset(new MemTracker(-1, "delta writer test")); +} + +void tear_down() { + if (k_engine != nullptr) { + k_engine->stop(); + delete k_engine; + k_engine = nullptr; + } + system("rm -rf ./data_test"); + if (const char* doris_home = getenv("DORIS_HOME")) FileUtils::remove_all(std::string(doris_home) + UNUSED_PREFIX); +} + +void create_tablet_request(int64_t tablet_id, int32_t schema_hash, TCreateTabletReq* request) { + request->tablet_id = tablet_id; + request->__set_version(1); + request->tablet_schema.schema_hash = schema_hash; + request->tablet_schema.short_key_column_count = 6; + request->tablet_schema.keys_type = TKeysType::AGG_KEYS; + request->tablet_schema.storage_type = TStorageType::COLUMN; + + TColumn k1; + k1.column_name = "k1"; + k1.__set_is_key(true); + k1.column_type.type = TPrimitiveType::TINYINT; + request->tablet_schema.columns.push_back(k1); + + TColumn k2; + k2.column_name = "k2"; + k2.__set_is_key(true); + k2.column_type.type = TPrimitiveType::SMALLINT; + request->tablet_schema.columns.push_back(k2); + + TColumn k3; + k3.column_name = "k3"; + k3.__set_is_key(true); + k3.column_type.type = TPrimitiveType::INT; + request->tablet_schema.columns.push_back(k3); + + TColumn k4; + k4.column_name = "k4"; + k4.__set_is_key(true); + k4.column_type.type 
= TPrimitiveType::BIGINT; + request->tablet_schema.columns.push_back(k4); + + TColumn k5; + k5.column_name = "k5"; + k5.__set_is_key(true); + k5.column_type.type = TPrimitiveType::LARGEINT; + request->tablet_schema.columns.push_back(k5); + + TColumn k6; + k6.column_name = "k6"; + k6.__set_is_key(true); + k6.column_type.type = TPrimitiveType::DATE; + request->tablet_schema.columns.push_back(k6); + + TColumn k7; + k7.column_name = "k7"; + k7.__set_is_key(true); + k7.column_type.type = TPrimitiveType::DATETIME; + request->tablet_schema.columns.push_back(k7); + + TColumn k8; + k8.column_name = "k8"; + k8.__set_is_key(true); + k8.column_type.type = TPrimitiveType::CHAR; + k8.column_type.__set_len(4); + request->tablet_schema.columns.push_back(k8); + + TColumn k9; + k9.column_name = "k9"; + k9.__set_is_key(true); + k9.column_type.type = TPrimitiveType::VARCHAR; + k9.column_type.__set_len(65); + request->tablet_schema.columns.push_back(k9); + + TColumn k10; + k10.column_name = "k10"; + k10.__set_is_key(true); + k10.column_type.type = TPrimitiveType::DECIMALV2; + k10.column_type.__set_precision(6); + k10.column_type.__set_scale(3); + request->tablet_schema.columns.push_back(k10); + + TColumn v1; + v1.column_name = "v1"; + v1.__set_is_key(false); + v1.column_type.type = TPrimitiveType::TINYINT; + v1.__set_aggregation_type(TAggregationType::SUM); + request->tablet_schema.columns.push_back(v1); + + TColumn v2; + v2.column_name = "v2"; + v2.__set_is_key(false); + v2.column_type.type = TPrimitiveType::SMALLINT; + v2.__set_aggregation_type(TAggregationType::SUM); + request->tablet_schema.columns.push_back(v2); + + TColumn v3; + v3.column_name = "v3"; + v3.__set_is_key(false); + v3.column_type.type = TPrimitiveType::INT; + v3.__set_aggregation_type(TAggregationType::SUM); + request->tablet_schema.columns.push_back(v3); + + TColumn v4; + v4.column_name = "v4"; + v4.__set_is_key(false); + v4.column_type.type = TPrimitiveType::BIGINT; + 
v4.__set_aggregation_type(TAggregationType::SUM); + request->tablet_schema.columns.push_back(v4); + + TColumn v5; + v5.column_name = "v5"; + v5.__set_is_key(false); + v5.column_type.type = TPrimitiveType::LARGEINT; + v5.__set_aggregation_type(TAggregationType::SUM); + request->tablet_schema.columns.push_back(v5); + + TColumn v6; + v6.column_name = "v6"; + v6.__set_is_key(false); + v6.column_type.type = TPrimitiveType::DATE; + v6.__set_aggregation_type(TAggregationType::REPLACE); + request->tablet_schema.columns.push_back(v6); + + TColumn v7; + v7.column_name = "v7"; + v7.__set_is_key(false); + v7.column_type.type = TPrimitiveType::DATETIME; + v7.__set_aggregation_type(TAggregationType::REPLACE); + request->tablet_schema.columns.push_back(v7); + + TColumn v8; + v8.column_name = "v8"; + v8.__set_is_key(false); + v8.column_type.type = TPrimitiveType::CHAR; + v8.column_type.__set_len(4); + v8.__set_aggregation_type(TAggregationType::REPLACE); + request->tablet_schema.columns.push_back(v8); + + TColumn v9; + v9.column_name = "v9"; + v9.__set_is_key(false); + v9.column_type.type = TPrimitiveType::VARCHAR; + v9.column_type.__set_len(65); + v9.__set_aggregation_type(TAggregationType::REPLACE); + request->tablet_schema.columns.push_back(v9); + + TColumn v10; + v10.column_name = "v10"; + v10.__set_is_key(false); + v10.column_type.type = TPrimitiveType::DECIMALV2; + v10.column_type.__set_precision(6); + v10.column_type.__set_scale(3); + v10.__set_aggregation_type(TAggregationType::SUM); + request->tablet_schema.columns.push_back(v10); +} + +void create_tablet_request_with_sequence_col(int64_t tablet_id, int32_t schema_hash, + TCreateTabletReq* request) { + request->tablet_id = tablet_id; + request->__set_version(1); + request->tablet_schema.schema_hash = schema_hash; + request->tablet_schema.short_key_column_count = 2; + request->tablet_schema.keys_type = TKeysType::UNIQUE_KEYS; + request->tablet_schema.storage_type = TStorageType::COLUMN; + 
request->tablet_schema.__set_sequence_col_idx(2); + + TColumn k1; + k1.column_name = "k1"; + k1.__set_is_key(true); + k1.column_type.type = TPrimitiveType::TINYINT; + request->tablet_schema.columns.push_back(k1); + + TColumn k2; + k2.column_name = "k2"; + k2.__set_is_key(true); + k2.column_type.type = TPrimitiveType::SMALLINT; + request->tablet_schema.columns.push_back(k2); + + TColumn sequence_col; + sequence_col.column_name = SEQUENCE_COL; + sequence_col.__set_is_key(false); + sequence_col.column_type.type = TPrimitiveType::INT; + sequence_col.__set_aggregation_type(TAggregationType::REPLACE); + request->tablet_schema.columns.push_back(sequence_col); + + TColumn v1; + v1.column_name = "v1"; + v1.__set_is_key(false); + v1.column_type.type = TPrimitiveType::DATETIME; + v1.__set_aggregation_type(TAggregationType::REPLACE); + request->tablet_schema.columns.push_back(v1); +} + +TDescriptorTable create_descriptor_tablet() { + TDescriptorTableBuilder dtb; + TTupleDescriptorBuilder tuple_builder; + + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_TINYINT).column_name("k1").column_pos(0).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_SMALLINT).column_name("k2").column_pos(1).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_INT).column_name("k3").column_pos(2).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_BIGINT).column_name("k4").column_pos(3).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_LARGEINT).column_name("k5").column_pos(4).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_DATE).column_name("k6").column_pos(5).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_DATETIME).column_name("k7").column_pos(6).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().string_type(4).column_name("k8").column_pos(7).build()); + tuple_builder.add_slot( + 
TSlotDescriptorBuilder().string_type(65).column_name("k9").column_pos(8).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().decimal_type(6, 3).column_name("k10").column_pos(9).build()); + + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_TINYINT).column_name("v1").column_pos(10).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_SMALLINT).column_name("v2").column_pos(11).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_INT).column_name("v3").column_pos(12).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_BIGINT).column_name("v4").column_pos(13).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_LARGEINT).column_name("v5").column_pos(14).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_DATE).column_name("v6").column_pos(15).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_DATETIME).column_name("v7").column_pos(16).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().string_type(4).column_name("v8").column_pos(17).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().string_type(65).column_name("v9").column_pos(18).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().decimal_type(6, 3).column_name("v10").column_pos(19).build()); + tuple_builder.build(&dtb); + + return dtb.desc_tbl(); +} + +TDescriptorTable create_descriptor_tablet_with_sequence_col() { + TDescriptorTableBuilder dtb; + TTupleDescriptorBuilder tuple_builder; + + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_TINYINT).column_name("k1").column_pos(0).build()); + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_SMALLINT).column_name("k2").column_pos(1).build()); + tuple_builder.add_slot(TSlotDescriptorBuilder() + .type(TYPE_INT) + .column_name(SEQUENCE_COL) + .column_pos(2) + .build()); + tuple_builder.add_slot( + 
TSlotDescriptorBuilder().type(TYPE_DATETIME).column_name("v1").column_pos(3).build()); + tuple_builder.build(&dtb); + + return dtb.desc_tbl(); +} + +class VTestDeltaWriter : public ::testing::Test { +public: + VTestDeltaWriter() {} + ~VTestDeltaWriter() {} + + void SetUp() { + // Create local data dir for StorageEngine. + std::cout << "setup" << std::endl; + } + + void TearDown() { + // Remove all dir. + std::cout << "tear down" << std::endl; + //doris::tear_down(); + //ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); + } +}; + +TEST_F(VTestDeltaWriter, open) { + TCreateTabletReq request; + create_tablet_request(10003, 270068375, &request); + OLAPStatus res = k_engine->create_tablet(request); + ASSERT_EQ(OLAP_SUCCESS, res); + + TDescriptorTable tdesc_tbl = create_descriptor_tablet(); + ObjectPool obj_pool; + DescriptorTbl* desc_tbl = nullptr; + DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + TupleDescriptor* tuple_desc = desc_tbl->get_tuple_descriptor(0); + + PUniqueId load_id; + load_id.set_hi(0); + load_id.set_lo(0); + WriteRequest write_req = {10003, 270068375, WriteType::LOAD, 20001, + 30001, load_id, false, tuple_desc}; + vectorized::VDeltaWriter* delta_writer = nullptr; + vectorized::VDeltaWriter::open(&write_req, k_mem_tracker, &delta_writer); + ASSERT_NE(delta_writer, nullptr); + res = delta_writer->close(); + ASSERT_EQ(OLAP_SUCCESS, res); + res = delta_writer->close_wait(nullptr, false); + ASSERT_EQ(OLAP_SUCCESS, res); + SAFE_DELETE(delta_writer); + + TDropTabletReq drop_request; + auto tablet_id = 10003; + auto schema_hash = 270068375; + res = k_engine->tablet_manager()->drop_tablet(tablet_id, schema_hash); + ASSERT_EQ(OLAP_SUCCESS, res); +} + +TEST_F(VTestDeltaWriter, write) { + TCreateTabletReq request; + create_tablet_request(10004, 270068376, &request); + OLAPStatus res = k_engine->create_tablet(request); + ASSERT_EQ(OLAP_SUCCESS, res); + + TDescriptorTable tdesc_tbl = create_descriptor_tablet(); + ObjectPool obj_pool; + 
DescriptorTbl* desc_tbl = nullptr; + DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + TupleDescriptor* tuple_desc = desc_tbl->get_tuple_descriptor(0); +// const std::vector& slots = tuple_desc->slots(); + + PUniqueId load_id; + load_id.set_hi(0); + load_id.set_lo(0); + WriteRequest write_req = {10004, 270068376, WriteType::LOAD, 20002, 30002, load_id, + false, tuple_desc, &(tuple_desc->slots())}; + vectorized::VDeltaWriter* delta_writer = nullptr; + vectorized::VDeltaWriter::open(&write_req, k_mem_tracker, &delta_writer); + ASSERT_NE(delta_writer, nullptr); + + auto tracker = std::make_shared(); + MemPool pool(tracker.get()); + + vectorized::Block block; + for (const auto& slot_desc : tuple_desc->slots()) { + block.insert(vectorized::ColumnWithTypeAndName(slot_desc->get_empty_mutable_column(), + slot_desc->get_data_type_ptr(), + slot_desc->col_name())); + } + + auto columns = block.mutate_columns(); + { + int8_t k1 = -127; + columns[0]->insert_data((const char*)&k1, sizeof(k1)); + + int16_t k2 = -32767; + columns[1]->insert_data((const char*)&k2, sizeof(k2)); + + int32_t k3 = -2147483647; + columns[2]->insert_data((const char*)&k3, sizeof(k3)); + + int64_t k4 = -9223372036854775807L; + columns[3]->insert_data((const char*)&k4, sizeof(k4)); + + int128_t k5 = -90000; + columns[4]->insert_data((const char*)&k5, sizeof(k5)); + + DateTimeValue k6; + k6.from_date_str("2048-11-10", 10); + auto k6_int = k6.to_int64(); + columns[5]->insert_data((const char*)&k6_int, sizeof(k6_int)); + + DateTimeValue k7; + k7.from_date_str("2636-08-16 19:39:43", 19); + auto k7_int = k7.to_int64(); + columns[6]->insert_data((const char*)&k7_int, sizeof(k7_int)); + + columns[7]->insert_data("abcd", 4); + columns[8]->insert_data("abcde", 5); + + DecimalV2Value decimal_value; + decimal_value.assign_from_double(1.1); + columns[9]->insert_data((const char*)&decimal_value, sizeof(decimal_value)); + + int8_t v1 = -127; + columns[10]->insert_data((const char*)&v1, sizeof(v1)); + + int16_t 
v2 = -32767; + columns[11]->insert_data((const char*)&v2, sizeof(v2)); + + int32_t v3 = -2147483647; + columns[12]->insert_data((const char*)&v3, sizeof(v3)); + + int64_t v4 = -9223372036854775807L; + columns[13]->insert_data((const char*)&v4, sizeof(v4)); + + int128_t v5 = -90000; + columns[14]->insert_data((const char*)&v5, sizeof(v5)); + + DateTimeValue v6; + v6.from_date_str("2048-11-10", 10); + auto v6_int = v6.to_int64(); + columns[15]->insert_data((const char*)&v6_int, sizeof(v6_int)); + + DateTimeValue v7; + v7.from_date_str("2636-08-16 19:39:43", 19); + auto v7_int = v7.to_int64(); + columns[16]->insert_data((const char*)&v7_int, sizeof(v7_int)); + + columns[17]->insert_data("abcd", 4); + columns[18]->insert_data("abcde", 5); + + decimal_value.assign_from_double(1.1); + columns[19]->insert_data((const char*)&decimal_value, sizeof(decimal_value)); + + res = delta_writer->write_block(&block, {0}); + ASSERT_EQ(OLAP_SUCCESS, res); + } + + res = delta_writer->close(); + ASSERT_EQ(OLAP_SUCCESS, res); + res = delta_writer->close_wait(nullptr, false); + ASSERT_EQ(OLAP_SUCCESS, res); + + // publish version success + TabletSharedPtr tablet = + k_engine->tablet_manager()->get_tablet(write_req.tablet_id, write_req.schema_hash); + std::cout << "before publish, tablet row nums:" << tablet->num_rows() << std::endl; + OlapMeta* meta = tablet->data_dir()->get_meta(); + Version version; + version.first = tablet->rowset_with_max_version()->end_version() + 1; + version.second = tablet->rowset_with_max_version()->end_version() + 1; + std::cout << "start to add rowset version:" << version.first << "-" << version.second + << std::endl; + std::map tablet_related_rs; + StorageEngine::instance()->txn_manager()->get_txn_related_tablets( + write_req.txn_id, write_req.partition_id, &tablet_related_rs); + for (auto& tablet_rs : tablet_related_rs) { + std::cout << "start to publish txn" << std::endl; + RowsetSharedPtr rowset = tablet_rs.second; + res = 
k_engine->txn_manager()->publish_txn(meta, write_req.partition_id, write_req.txn_id, + write_req.tablet_id, write_req.schema_hash, + tablet_rs.first.tablet_uid, version); + ASSERT_EQ(OLAP_SUCCESS, res); + std::cout << "start to add inc rowset:" << rowset->rowset_id() + << ", num rows:" << rowset->num_rows() << ", version:" << rowset->version().first + << "-" << rowset->version().second << std::endl; + res = tablet->add_inc_rowset(rowset); + ASSERT_EQ(OLAP_SUCCESS, res); + } + ASSERT_EQ(1, tablet->num_rows()); + + auto tablet_id = 10004; + auto schema_hash = 270068376; + res = k_engine->tablet_manager()->drop_tablet(tablet_id, schema_hash); + ASSERT_EQ(OLAP_SUCCESS, res); + delete delta_writer; +} + +TEST_F(VTestDeltaWriter, sequence_col) { + TCreateTabletReq request; + create_tablet_request_with_sequence_col(10005, 270068377, &request); + OLAPStatus res = k_engine->create_tablet(request); + ASSERT_EQ(OLAP_SUCCESS, res); + + TDescriptorTable tdesc_tbl = create_descriptor_tablet_with_sequence_col(); + ObjectPool obj_pool; + DescriptorTbl* desc_tbl = nullptr; + DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + TupleDescriptor* tuple_desc = desc_tbl->get_tuple_descriptor(0); + + PUniqueId load_id; + load_id.set_hi(0); + load_id.set_lo(0); + WriteRequest write_req = {10005, 270068377, WriteType::LOAD, 20003, 30003, load_id, + false, tuple_desc, &(tuple_desc->slots())}; + vectorized::VDeltaWriter* delta_writer = nullptr; + vectorized::VDeltaWriter::open(&write_req, k_mem_tracker, &delta_writer); + ASSERT_NE(delta_writer, nullptr); + + MemTracker tracker; + MemPool pool(&tracker); + + vectorized::Block block; + for (const auto& slot_desc : tuple_desc->slots()) { + block.insert(vectorized::ColumnWithTypeAndName(slot_desc->get_empty_mutable_column(), + slot_desc->get_data_type_ptr(), + slot_desc->col_name())); + } + + auto columns = block.mutate_columns(); + { + int8_t c1 = 123; + columns[0]->insert_data((const char*)&c1, sizeof(c1)); + + int16_t c2 = 456; +
columns[1]->insert_data((const char*)&c2, sizeof(c2)); + + int32_t c3 = 1; + columns[2]->insert_data((const char*)&c3, sizeof(c3)); + + DateTimeValue c4; + c4.from_date_str("2020-07-16 19:39:43", 19); + int64_t c4_int = c4.to_int64(); + columns[3]->insert_data((const char*)&c4_int, sizeof(c4_int)); + + res = delta_writer->write_block(&block, {0}); + ASSERT_EQ(OLAP_SUCCESS, res); + } + + res = delta_writer->close(); + ASSERT_EQ(OLAP_SUCCESS, res); + res = delta_writer->close_wait(nullptr, false); + ASSERT_EQ(OLAP_SUCCESS, res); + + // publish version success + TabletSharedPtr tablet = + k_engine->tablet_manager()->get_tablet(write_req.tablet_id, write_req.schema_hash); + std::cout << "before publish, tablet row nums:" << tablet->num_rows() << std::endl; + OlapMeta* meta = tablet->data_dir()->get_meta(); + Version version; + version.first = tablet->rowset_with_max_version()->end_version() + 1; + version.second = tablet->rowset_with_max_version()->end_version() + 1; + std::cout << "start to add rowset version:" << version.first << "-" << version.second + << std::endl; + std::map tablet_related_rs; + StorageEngine::instance()->txn_manager()->get_txn_related_tablets( + write_req.txn_id, write_req.partition_id, &tablet_related_rs); + for (auto& tablet_rs : tablet_related_rs) { + std::cout << "start to publish txn" << std::endl; + RowsetSharedPtr rowset = tablet_rs.second; + res = k_engine->txn_manager()->publish_txn(meta, write_req.partition_id, write_req.txn_id, + write_req.tablet_id, write_req.schema_hash, + tablet_rs.first.tablet_uid, version); + ASSERT_EQ(OLAP_SUCCESS, res); + std::cout << "start to add inc rowset:" << rowset->rowset_id() + << ", num rows:" << rowset->num_rows() << ", version:" << rowset->version().first + << "-" << rowset->version().second << std::endl; + res = tablet->add_inc_rowset(rowset); + ASSERT_EQ(OLAP_SUCCESS, res); + } + ASSERT_EQ(1, tablet->num_rows()); + + auto tablet_id = 10005; + auto schema_hash = 270068377; + res =
k_engine->tablet_manager()->drop_tablet(tablet_id, schema_hash); + ASSERT_EQ(OLAP_SUCCESS, res); + delete delta_writer; +} + +} // namespace doris + +int main(int argc, char** argv) { + std::string conffile = std::string(getenv("DORIS_HOME")) + "/conf/be.conf"; + if (!doris::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } + int ret = doris::OLAP_SUCCESS; + testing::InitGoogleTest(&argc, argv); + doris::CpuInfo::init(); + doris::set_up(); + ret = RUN_ALL_TESTS(); + doris::tear_down(); + google::protobuf::ShutdownProtobufLibrary(); + return ret; +} diff --git a/be/test/vec/runtime/CMakeLists.txt b/be/test/vec/runtime/CMakeLists.txt index f01816299ddc29..8ce2326b6cb384 100644 --- a/be/test/vec/runtime/CMakeLists.txt +++ b/be/test/vec/runtime/CMakeLists.txt @@ -19,4 +19,5 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/vec/runtime") ADD_BE_TEST(vdata_stream_test) +ADD_BE_TEST(vload_channel_mgr_test) diff --git a/be/test/vec/runtime/vload_channel_mgr_test.cpp b/be/test/vec/runtime/vload_channel_mgr_test.cpp new file mode 100644 index 00000000000000..71cfceef781680 --- /dev/null +++ b/be/test/vec/runtime/vload_channel_mgr_test.cpp @@ -0,0 +1,763 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/runtime/vload_channel_mgr.h" + +#include + +#include "common/object_pool.h" +#include "gen_cpp/Descriptors_types.h" +#include "gen_cpp/PaloInternalService_types.h" +#include "gen_cpp/Types_types.h" +#include "vec/olap/vdelta_writer.h" +#include "olap/memtable_flush_executor.h" +#include "olap/schema.h" +#include "olap/storage_engine.h" +#include "runtime/descriptor_helper.h" +#include "runtime/descriptors.h" +#include "runtime/exec_env.h" +#include "runtime/mem_tracker.h" +#include "runtime/primitive_type.h" +#include "runtime/row_batch.h" +#include "runtime/tuple_row.h" +#include "util/thrift_util.h" + +namespace doris { + +std::unordered_map _k_tablet_recorder; +OLAPStatus open_status; +OLAPStatus add_status; +OLAPStatus close_status; +int64_t wait_lock_time_ns; + +// mock +DeltaWriter::DeltaWriter(WriteRequest* req, const std::shared_ptr& mem_tracker, + StorageEngine* storage_engine) + : _req(*req) {} + +DeltaWriter::~DeltaWriter() {} + +OLAPStatus DeltaWriter::init() { + return OLAP_SUCCESS; +} + +OLAPStatus DeltaWriter::open(WriteRequest* req, const std::shared_ptr& mem_tracker, + DeltaWriter** writer) { + if (open_status != OLAP_SUCCESS) { + return open_status; + } + *writer = new DeltaWriter(req, mem_tracker, nullptr); + return open_status; +} + +OLAPStatus DeltaWriter::write(Tuple* tuple) { + if (_k_tablet_recorder.find(_req.tablet_id) == std::end(_k_tablet_recorder)) { + _k_tablet_recorder[_req.tablet_id] = 1; + } else { + _k_tablet_recorder[_req.tablet_id]++; + } + return add_status; +} + +OLAPStatus DeltaWriter::write(const RowBatch* row_batch, const std::vector& row_idxs) { + if (_k_tablet_recorder.find(_req.tablet_id) == std::end(_k_tablet_recorder)) { + _k_tablet_recorder[_req.tablet_id] = 0; + } + _k_tablet_recorder[_req.tablet_id] += row_idxs.size(); + return add_status; +} + +OLAPStatus DeltaWriter::close() { + return 
OLAP_SUCCESS; +} + +OLAPStatus DeltaWriter::close_wait(google::protobuf::RepeatedPtrField* tablet_vec, bool is_broken) { + return close_status; +} + +OLAPStatus DeltaWriter::cancel() { + return OLAP_SUCCESS; +} + +OLAPStatus DeltaWriter::flush_memtable_and_wait(bool need_wait) { + return OLAP_SUCCESS; +} + +OLAPStatus DeltaWriter::wait_flush() { + return OLAP_SUCCESS; +} + +int64_t DeltaWriter::partition_id() const { + return 1L; +} +int64_t DeltaWriter::mem_consumption() const { + return 1024L; +} + +namespace vectorized { + +VDeltaWriter::VDeltaWriter(WriteRequest* req, const std::shared_ptr& parent, + StorageEngine* storage_engine) + : DeltaWriter(req, parent, storage_engine) {} + +VDeltaWriter::~VDeltaWriter() {} + +OLAPStatus VDeltaWriter::open(WriteRequest* req, const std::shared_ptr& mem_tracker, + VDeltaWriter** writer) { + if (open_status != OLAP_SUCCESS) { + return open_status; + } + *writer = new VDeltaWriter(req, mem_tracker, nullptr); + return open_status; +} + +OLAPStatus VDeltaWriter::write(const Block* block, const std::vector& row_idxs) { + if (_k_tablet_recorder.find(_req.tablet_id) == std::end(_k_tablet_recorder)) { + _k_tablet_recorder[_req.tablet_id] = 0; + } + _k_tablet_recorder[_req.tablet_id] += row_idxs.size(); + return add_status; +} + +} + +class VLoadChannelMgrTest : public testing::Test { +public: + VLoadChannelMgrTest() {} + virtual ~VLoadChannelMgrTest() {} + void SetUp() override { + _k_tablet_recorder.clear(); + open_status = OLAP_SUCCESS; + add_status = OLAP_SUCCESS; + close_status = OLAP_SUCCESS; + config::streaming_load_rpc_max_alive_time_sec = 120; + } + +private: + + size_t uncompressed_size = 0; + size_t compressed_size = 0; +}; + +TDescriptorTable create_descriptor_table() { + TDescriptorTableBuilder dtb; + TTupleDescriptorBuilder tuple_builder; + + tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_INT).column_name("c1").column_pos(0).build()); + tuple_builder.add_slot( + 
TSlotDescriptorBuilder().type(TYPE_BIGINT).column_name("c2").column_pos(1).build()); + tuple_builder.build(&dtb); + + return dtb.desc_tbl(); +} + +Schema create_schema() { + std::vector col_schemas; + //c1 + TabletColumn c1(OLAP_FIELD_AGGREGATION_NONE, OLAP_FIELD_TYPE_INT, true); + c1.set_name("c1"); + + col_schemas.emplace_back(std::move(c1)); + // c2: int + TabletColumn c2(OLAP_FIELD_AGGREGATION_NONE, OLAP_FIELD_TYPE_BIGINT, true); + c2.set_name("c2"); + col_schemas.emplace_back(std::move(c2)); + + Schema schema(col_schemas, 2); + return schema; +} + +void create_schema(DescriptorTbl* desc_tbl, POlapTableSchemaParam* pschema) { + pschema->set_db_id(1); + pschema->set_table_id(2); + pschema->set_version(0); + + auto tuple_desc = desc_tbl->get_tuple_descriptor(0); + tuple_desc->to_protobuf(pschema->mutable_tuple_desc()); + for (auto slot : tuple_desc->slots()) { + slot->to_protobuf(pschema->add_slot_descs()); + } + + // index schema + auto indexes = pschema->add_indexes(); + indexes->set_id(4); + indexes->add_columns("c1"); + indexes->add_columns("c2"); + indexes->set_schema_hash(123); +} + +static void create_block(Schema& schema, vectorized::Block& block) +{ + for (auto &column_desc : schema.columns()) { + ASSERT_TRUE(column_desc); + auto data_type = Schema::get_data_type_ptr(column_desc->type()); + ASSERT_NE(data_type, nullptr); + if (column_desc->is_nullable()) { + data_type = std::make_shared(std::move(data_type)); + } + auto column = data_type->create_column(); + vectorized::ColumnWithTypeAndName ctn(std::move(column), data_type, column_desc->name()); + block.insert(ctn); + } +} + +TEST_F(VLoadChannelMgrTest, normal) { + ExecEnv env; + vectorized::VLoadChannelMgr mgr; + mgr.init(-1); + + auto schema = create_schema(); + auto tdesc_tbl = create_descriptor_table(); + ObjectPool obj_pool; + DescriptorTbl* desc_tbl = nullptr; + DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + auto tracker = 
std::make_shared(); + PUniqueId load_id; + load_id.set_hi(2); + load_id.set_lo(3); + { + PTabletWriterOpenRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_txn_id(1); + create_schema(desc_tbl, request.mutable_schema()); + for (int i = 0; i < 2; ++i) { + auto tablet = request.add_tablets(); + tablet->set_partition_id(10 + i); + tablet->set_tablet_id(20 + i); + } + request.set_num_senders(1); + request.set_need_gen_rollup(false); + auto st = mgr.open(request); + if (!st.ok()) { + LOG(INFO) << "here we go!!!!"; + LOG(INFO) << st.to_string() << std::endl; + } + request.release_id(); + ASSERT_TRUE(st.ok()); + } + + // add a block + { + PTabletWriterAddBlockRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_sender_id(0); + request.set_eos(true); + request.set_packet_seq(0); + + request.add_tablet_ids(20); + request.add_tablet_ids(21); + request.add_tablet_ids(20); + + vectorized::Block block; + create_block(schema, block); + + auto columns = block.mutate_columns(); + auto& col1 = columns[0]; + auto& col2 = columns[1]; + + // row1 + { + int value = 987654; + int64_t big_value = 1234567899876; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + // row2 + { + int value = 12345678; + int64_t big_value = 9876567899876; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + // row3 + { + int value = 876545678; + int64_t big_value = 76543234567; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + + PTabletWriterAddBlockResult response; + std::string buffer; + block.serialize(request.mutable_block(), &uncompressed_size, &compressed_size, &buffer); + auto st = mgr.add_block(request, &response); + if (!st.ok()) { + LOG(INFO) << "here we go!!!!"; + LOG(INFO) << 
st.to_string() << std::endl; + } + request.release_id(); + ASSERT_TRUE(st.ok()); + } + // check content + ASSERT_EQ(_k_tablet_recorder[20], 2); + ASSERT_EQ(_k_tablet_recorder[21], 1); +} + +TEST_F(VLoadChannelMgrTest, cancel) { + ExecEnv env; + vectorized::VLoadChannelMgr mgr; + mgr.init(-1); + + auto tdesc_tbl = create_descriptor_table(); + ObjectPool obj_pool; + DescriptorTbl* desc_tbl = nullptr; + DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + + PUniqueId load_id; + load_id.set_hi(2); + load_id.set_lo(3); + { + PTabletWriterOpenRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_txn_id(1); + create_schema(desc_tbl, request.mutable_schema()); + for (int i = 0; i < 2; ++i) { + auto tablet = request.add_tablets(); + tablet->set_partition_id(10 + i); + tablet->set_tablet_id(20 + i); + } + request.set_num_senders(1); + request.set_need_gen_rollup(false); + auto st = mgr.open(request); + request.release_id(); + ASSERT_TRUE(st.ok()); + } + + // add a batch + { + PTabletWriterCancelRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + auto st = mgr.cancel(request); + request.release_id(); + ASSERT_TRUE(st.ok()); + } +} + +TEST_F(VLoadChannelMgrTest, open_failed) { + ExecEnv env; + vectorized::VLoadChannelMgr mgr; + mgr.init(-1); + + auto tdesc_tbl = create_descriptor_table(); + ObjectPool obj_pool; + DescriptorTbl* desc_tbl = nullptr; + DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + + PUniqueId load_id; + load_id.set_hi(2); + load_id.set_lo(3); + { + PTabletWriterOpenRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_txn_id(1); + create_schema(desc_tbl, request.mutable_schema()); + for (int i = 0; i < 2; ++i) { + auto tablet = request.add_tablets(); + tablet->set_partition_id(10 + i); + tablet->set_tablet_id(20 + i); + } + 
request.set_num_senders(1); + request.set_need_gen_rollup(false); + open_status = OLAP_ERR_TABLE_NOT_FOUND; + auto st = mgr.open(request); + request.release_id(); + ASSERT_FALSE(st.ok()); + } +} + +TEST_F(VLoadChannelMgrTest, add_failed) { + ExecEnv env; + vectorized::VLoadChannelMgr mgr; + mgr.init(-1); + + auto schema = create_schema(); + auto tdesc_tbl = create_descriptor_table(); + ObjectPool obj_pool; + DescriptorTbl* desc_tbl = nullptr; + DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + auto tracker = std::make_shared(); + PUniqueId load_id; + load_id.set_hi(2); + load_id.set_lo(3); + { + PTabletWriterOpenRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_txn_id(1); + create_schema(desc_tbl, request.mutable_schema()); + for (int i = 0; i < 2; ++i) { + auto tablet = request.add_tablets(); + tablet->set_partition_id(10 + i); + tablet->set_tablet_id(20 + i); + } + request.set_num_senders(1); + request.set_need_gen_rollup(false); + auto st = mgr.open(request); + request.release_id(); + ASSERT_TRUE(st.ok()); + } + + // add a batch + { + PTabletWriterAddBlockRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_sender_id(0); + request.set_eos(true); + request.set_packet_seq(0); + + request.add_tablet_ids(20); + request.add_tablet_ids(21); + request.add_tablet_ids(20); + + vectorized::Block block; + create_block(schema, block); + + auto columns = block.mutate_columns(); + auto& col1 = columns[0]; + auto& col2 = columns[1]; + + // row1 + { + int value = 987654; + int64_t big_value = 1234567899876; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + // row2 + { + int value = 12345678; + int64_t big_value = 9876567899876; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + // 
row3 + { + int value = 876545678; + int64_t big_value = 76543234567; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + + std::string buffer; + block.serialize(request.mutable_block(), &uncompressed_size, &compressed_size, &buffer); + // DeltaWriter's write will return -215 + add_status = OLAP_ERR_TABLE_NOT_FOUND; + PTabletWriterAddBlockResult response; + auto st = mgr.add_block(request, &response); + request.release_id(); + // st is still ok. + ASSERT_TRUE(st.ok()); + ASSERT_EQ(2, response.tablet_errors().size()); + } +} + + +TEST_F(VLoadChannelMgrTest, close_failed) { + ExecEnv env; + vectorized::VLoadChannelMgr mgr; + mgr.init(-1); + + auto schema = create_schema(); + auto tdesc_tbl = create_descriptor_table(); + ObjectPool obj_pool; + DescriptorTbl* desc_tbl = nullptr; + DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + auto tracker = std::make_shared(); + PUniqueId load_id; + load_id.set_hi(2); + load_id.set_lo(3); + { + PTabletWriterOpenRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_txn_id(1); + create_schema(desc_tbl, request.mutable_schema()); + for (int i = 0; i < 2; ++i) { + auto tablet = request.add_tablets(); + tablet->set_partition_id(10 + i); + tablet->set_tablet_id(20 + i); + } + request.set_num_senders(1); + request.set_need_gen_rollup(false); + auto st = mgr.open(request); + request.release_id(); + ASSERT_TRUE(st.ok()); + } + + // add a batch + { + PTabletWriterAddBlockRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_sender_id(0); + request.set_eos(true); + request.set_packet_seq(0); + + request.add_tablet_ids(20); + request.add_tablet_ids(21); + request.add_tablet_ids(20); + + request.add_partition_ids(10); + request.add_partition_ids(11); + + vectorized::Block block; + create_block(schema, block); + + auto columns = 
block.mutate_columns(); + auto& col1 = columns[0]; + auto& col2 = columns[1]; + + // row1 + { + int value = 987654; + int64_t big_value = 1234567899876; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + // row2 + { + int value = 12345678; + int64_t big_value = 9876567899876; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + // row3 + { + int value = 876545678; + int64_t big_value = 76543234567; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + + std::string buffer; + block.serialize(request.mutable_block(), &uncompressed_size, &compressed_size, &buffer); + close_status = OLAP_ERR_TABLE_NOT_FOUND; + PTabletWriterAddBlockResult response; + auto st = mgr.add_block(request, &response); + request.release_id(); + // even if delta close failed, the return status is still ok, but tablet_vec is empty + ASSERT_TRUE(st.ok()); + ASSERT_TRUE(response.tablet_vec().empty()); + } +} + +TEST_F(VLoadChannelMgrTest, unknown_tablet) { + ExecEnv env; + vectorized::VLoadChannelMgr mgr; + mgr.init(-1); + + auto schema = create_schema(); + auto tdesc_tbl = create_descriptor_table(); + ObjectPool obj_pool; + DescriptorTbl* desc_tbl = nullptr; + DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + auto tracker = std::make_shared(); + PUniqueId load_id; + load_id.set_hi(2); + load_id.set_lo(3); + { + PTabletWriterOpenRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_txn_id(1); + create_schema(desc_tbl, request.mutable_schema()); + for (int i = 0; i < 2; ++i) { + auto tablet = request.add_tablets(); + tablet->set_partition_id(10 + i); + tablet->set_tablet_id(20 + i); + } + request.set_num_senders(1); + request.set_need_gen_rollup(false); + auto st = 
mgr.open(request); + request.release_id(); + ASSERT_TRUE(st.ok()); + } + + // add a batch + { + PTabletWriterAddBlockRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_sender_id(0); + request.set_eos(true); + request.set_packet_seq(0); + + request.add_tablet_ids(20); + request.add_tablet_ids(22); + request.add_tablet_ids(20); + + vectorized::Block block; + create_block(schema, block); + + auto columns = block.mutate_columns(); + auto& col1 = columns[0]; + auto& col2 = columns[1]; + + // row1 + { + int value = 987654; + int64_t big_value = 1234567899876; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + // row2 + { + int value = 12345678; + int64_t big_value = 9876567899876; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + // row3 + { + int value = 876545678; + int64_t big_value = 76543234567; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + + std::string buffer; + block.serialize(request.mutable_block(), &uncompressed_size, &compressed_size, &buffer); + PTabletWriterAddBlockResult response; + auto st = mgr.add_block(request, &response); + request.release_id(); + ASSERT_FALSE(st.ok()); + } +} + +TEST_F(VLoadChannelMgrTest, duplicate_packet) { + ExecEnv env; + vectorized::VLoadChannelMgr mgr; + mgr.init(-1); + + auto schema = create_schema(); + auto tdesc_tbl = create_descriptor_table(); + ObjectPool obj_pool; + DescriptorTbl* desc_tbl = nullptr; + DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl); + RowDescriptor row_desc(*desc_tbl, {0}, {false}); + auto tracker = std::make_shared(); + PUniqueId load_id; + load_id.set_hi(2); + load_id.set_lo(3); + { + PTabletWriterOpenRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_txn_id(1); + 
create_schema(desc_tbl, request.mutable_schema()); + for (int i = 0; i < 2; ++i) { + auto tablet = request.add_tablets(); + tablet->set_partition_id(10 + i); + tablet->set_tablet_id(20 + i); + } + request.set_num_senders(1); + request.set_need_gen_rollup(false); + auto st = mgr.open(request); + request.release_id(); + ASSERT_TRUE(st.ok()); + } + + // add a batch + { + PTabletWriterAddBlockRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_sender_id(0); + request.set_eos(false); + request.set_packet_seq(0); + + request.add_tablet_ids(20); + request.add_tablet_ids(21); + request.add_tablet_ids(20); + + vectorized::Block block; + create_block(schema, block); + + auto columns = block.mutate_columns(); + auto& col1 = columns[0]; + auto& col2 = columns[1]; + + // row1 + { + int value = 987654; + int64_t big_value = 1234567899876; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + // row2 + { + int value = 12345678; + int64_t big_value = 9876567899876; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + // row3 + { + int value = 876545678; + int64_t big_value = 76543234567; + col1->insert_data((const char*)&value, sizeof(value)); + col2->insert_data((const char*)&big_value, sizeof(big_value)); + } + + std::string buffer; + block.serialize(request.mutable_block(), &uncompressed_size, &compressed_size, &buffer); + PTabletWriterAddBlockResult response; + auto st = mgr.add_block(request, &response); + ASSERT_TRUE(st.ok()); + PTabletWriterAddBlockResult response2; + st = mgr.add_block(request, &response2); + request.release_id(); + ASSERT_TRUE(st.ok()); + } + // close + { + PTabletWriterAddBlockRequest request; + request.set_allocated_id(&load_id); + request.set_index_id(4); + request.set_sender_id(0); + request.set_eos(true); + request.set_packet_seq(0); + 
PTabletWriterAddBlockResult response; + auto st = mgr.add_block(request, &response); + request.release_id(); + ASSERT_TRUE(st.ok()); + } + // check content + ASSERT_EQ(_k_tablet_recorder[20], 2); + ASSERT_EQ(_k_tablet_recorder[21], 1); +} + +} // namespace doris + +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + doris::CpuInfo::init(); + return RUN_ALL_TESTS(); +} From 119e602f9aa9639a4571cf35f2b7de0bb05cd021 Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Wed, 16 Mar 2022 10:21:08 +0800 Subject: [PATCH 17/32] agg by vectorized method --- be/src/olap/memtable.cpp | 93 ++++++++++++++++++++++++++++++++++------ be/src/olap/memtable.h | 5 +++ 2 files changed, 86 insertions(+), 12 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 702ea5ed902897..2abd0fd307cc33 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -26,6 +26,8 @@ #include "runtime/tuple.h" #include "util/debug_util.h" #include "util/doris_metrics.h" +#include "vec/core/field.h" +#include "vec/aggregate_functions/aggregate_function_simple_factory.h" namespace doris { MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet_schema, @@ -42,12 +44,14 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet _table_mem_pool(new MemPool(_mem_tracker.get())), _schema_size(_schema->schema_size()), _rowset_writer(rowset_writer), - _is_first_insertion(true) { + _is_first_insertion(true), + _agg_functions(schema->num_columns()){ if (support_vec){ _skip_list = nullptr; _vec_row_comparator = std::make_shared(_schema); _vec_skip_list = new VecTable(_vec_row_comparator.get(), _table_mem_pool.get(), _keys_type == KeysType::DUP_KEYS); + _init_agg_functions(); }else{ _vec_skip_list =nullptr; if (tablet_schema->sort_type() == SortType::ZORDER) { @@ -61,6 +65,34 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet } } +void MemTable::_init_agg_functions() +{ 
+ + for (uint32_t cid = _schema->num_key_columns(); + cid < _schema->num_columns(); + ++cid) { + FieldAggregationMethod agg_method = + _tablet_schema + ->column(cid) + .aggregation(); + std::string agg_name = + TabletColumn::get_string_by_aggregation_type(agg_method) + "_reader"; + std::transform(agg_name.begin(), agg_name.end(), agg_name.begin(), + [](unsigned char c) { return std::tolower(c); }); + + // create aggregate function + vectorized::DataTypes argument_types; + vectorized::DataTypePtr dtptr = Schema::get_data_type_ptr(_schema->column(cid)->type()); + argument_types.push_back(dtptr); + vectorized::Array params; + vectorized::AggregateFunctionPtr func = vectorized::AggregateFunctionSimpleFactory::instance().get( + agg_name, argument_types, params, + dtptr->is_nullable()); + + DCHECK(func != nullptr); + _agg_functions[cid] = func; + } +} MemTable::~MemTable() { if (_skip_list) delete _skip_list; @@ -181,24 +213,61 @@ void MemTable::_aggregate_two_row(const ContiguousRow& src_row, TableKey row_in_ } void MemTable::_aggregate_two_rowInBlock(RowInBlock new_row, RowInBlock row_in_skiplist){ - if (_tablet_schema->has_sequence_col()) { + if (_tablet_schema->has_sequence_col()) + { auto sequence_idx = _tablet_schema->sequence_col_idx(); - auto seq_dst_cell = row_in_skiplist.cell(&_input_mutable_block, sequence_idx); - auto seq_src_cell = new_row.cell(&_input_mutable_block, sequence_idx); - auto res = _schema->column(sequence_idx)->compare_cell(seq_dst_cell, seq_src_cell); + auto res = _input_mutable_block.compare_at(row_in_skiplist._row_pos, new_row._row_pos, sequence_idx, _input_mutable_block, -1); // dst sequence column larger than src, don't need to update - if (res > 0) { - return; + if (res > 0){ + return ; } - } + } //dst is non-sequence row, or dst sequence is smaller for (uint32_t cid = _schema->num_key_columns(); cid < _schema->num_columns(); - ++cid) { - auto dst_cell = row_in_skiplist.cell(&_input_mutable_block, cid); - auto src_cell = 
new_row.cell(&_input_mutable_block, cid); - _schema->column(cid)->agg_update(&dst_cell, &src_cell, _table_mem_pool.get()); + ++cid) + { + vectorized::AggregateDataPtr place = const_cast( + _input_mutable_block.mutable_columns()[cid] + ->get_data_at(row_in_skiplist._row_pos).data); + + auto colptr = _input_mutable_block.mutable_columns()[cid].get(); + _agg_functions[cid]->add(place, + //static_cast(&_input_mutable_block.mutable_columns()[cid]), + const_cast( &colptr), + new_row._row_pos, + nullptr + ); } + + + // StringRef ref = block->mutable_columns()[cid]->get_data_at(_row_pos); + // bool is_null = block->mutable_columns()[cid]->is_null_at(_row_pos); + // NullState null_state = is_null ? NullState::IS_NULL : NullState::NOT_NULL; + // return RowCursorCell(ref.data, null_state); + + // auto dst_cell = row_in_skiplist.cell(&_input_mutable_block, cid); + // auto src_cell = new_row.cell(&_input_mutable_block, cid); + // _schema->column(cid)->agg_update(&dst_cell, &src_cell, _table_mem_pool.get()); + + // if (_tablet_schema->has_sequence_col()) { + // auto sequence_idx = _tablet_schema->sequence_col_idx(); + // auto seq_dst_cell = row_in_skiplist.cell(&_input_mutable_block, sequence_idx); + // auto seq_src_cell = new_row.cell(&_input_mutable_block, sequence_idx); + // auto res = _schema->column(sequence_idx)->compare_cell(seq_dst_cell, seq_src_cell); + // // dst sequence column larger than src, don't need to update + // if (res > 0) { + // return; + // } + // } + // //dst is non-sequence row, or dst sequence is smaller + // for (uint32_t cid = _schema->num_key_columns(); + // cid < _schema->num_columns(); + // ++cid) { + // auto dst_cell = row_in_skiplist.cell(&_input_mutable_block, cid); + // auto src_cell = new_row.cell(&_input_mutable_block, cid); + // _schema->column(cid)->agg_update(&dst_cell, &src_cell, _table_mem_pool.get()); + // } } vectorized::Block MemTable::collect_skiplist_results() diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 
04e3f969843470..7e5e93a9678592 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -27,6 +27,7 @@ #include "util/tuple_row_zorder_compare.h" #include "vec/core/block.h" #include "vec/common/string_ref.h" +#include "vec/aggregate_functions/aggregate_function.h" namespace doris { struct ContiguousRow; @@ -174,6 +175,10 @@ class MemTable { vectorized::MutableBlock _output_mutable_block; vectorized::Block collect_skiplist_results(); bool _is_first_insertion; + + void _init_agg_functions(); + std::vector _agg_functions; + }; // class MemTable From 436de968b2a11ebeeeab6d49e3053c72b9724cd5 Mon Sep 17 00:00:00 2001 From: jacktengg Date: Wed, 16 Mar 2022 14:22:54 +0800 Subject: [PATCH 18/32] vbroker scanner handle date and datetime correctly --- be/src/vec/exec/vbroker_scanner.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/be/src/vec/exec/vbroker_scanner.cpp b/be/src/vec/exec/vbroker_scanner.cpp index c52da490f10cce..5713a90a032a6a 100644 --- a/be/src/vec/exec/vbroker_scanner.cpp +++ b/be/src/vec/exec/vbroker_scanner.cpp @@ -270,16 +270,17 @@ Status VBrokerScanner::_fill_dest_columns(std::vector& columns break; } case TYPE_DATETIME: { - uint64_t value = *reinterpret_cast(value_ptr); - VecDateTimeValue data(value); + DateTimeValue value = *reinterpret_cast(value_ptr); + VecDateTimeValue date; + date.convert_dt_to_vec_dt(&value); assert_cast*>(column_ptr) - ->insert_data(reinterpret_cast(&data), 0); + ->insert_data(reinterpret_cast(&date), 0); break; } case TYPE_DATE: { - uint64_t value = *reinterpret_cast(value_ptr); + DateTimeValue value = *reinterpret_cast(value_ptr); VecDateTimeValue date; - date.from_olap_date(value); + date.convert_dt_to_vec_dt(&value); assert_cast*>(column_ptr) ->insert_data(reinterpret_cast(&date), 0); break; From fb37ca8602bc26d9d359d384194214614f76ccb0 Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Wed, 16 Mar 2022 16:25:30 +0800 Subject: [PATCH 19/32] vectorized skiplist --- 
be/src/olap/memtable.cpp | 79 +++++++++++++++++++--------------------- be/src/olap/memtable.h | 14 ++++++- 2 files changed, 51 insertions(+), 42 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 2abd0fd307cc33..1da3265fd5c275 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -85,12 +85,12 @@ void MemTable::_init_agg_functions() vectorized::DataTypePtr dtptr = Schema::get_data_type_ptr(_schema->column(cid)->type()); argument_types.push_back(dtptr); vectorized::Array params; - vectorized::AggregateFunctionPtr func = vectorized::AggregateFunctionSimpleFactory::instance().get( + vectorized::AggregateFunctionPtr function = vectorized::AggregateFunctionSimpleFactory::instance().get( agg_name, argument_types, params, dtptr->is_nullable()); - DCHECK(func != nullptr); - _agg_functions[cid] = func; + DCHECK(function != nullptr); + _agg_functions[cid] = function; } } MemTable::~MemTable() { @@ -149,6 +149,17 @@ void MemTable::insert_one_row_from_block(struct RowInBlock row_in_block) if (is_exist){ _aggregate_two_rowInBlock(row_in_block, _vec_hint.curr->key); }else{ + row_in_block.init_agg_places(_agg_functions, _schema->num_key_columns()); + for ( auto cid = _schema->num_key_columns(); cid < _schema->num_columns(); cid++){ + auto col_ptr = _input_mutable_block.mutable_columns()[cid].get(); + auto place = row_in_block._agg_places[cid]; + _agg_functions[cid]->add(place, + const_cast( &col_ptr), + row_in_block._row_pos, + nullptr + ); + } + _vec_skip_list->InsertWithHint(row_in_block, is_exist, &_vec_hint); } } @@ -227,55 +238,41 @@ void MemTable::_aggregate_two_rowInBlock(RowInBlock new_row, RowInBlock row_in_s cid < _schema->num_columns(); ++cid) { - vectorized::AggregateDataPtr place = const_cast( - _input_mutable_block.mutable_columns()[cid] - ->get_data_at(row_in_skiplist._row_pos).data); + auto place = row_in_skiplist._agg_places[cid]; + + auto col_ptr = _input_mutable_block.mutable_columns()[cid].get(); - auto colptr 
= _input_mutable_block.mutable_columns()[cid].get(); _agg_functions[cid]->add(place, - //static_cast(&_input_mutable_block.mutable_columns()[cid]), - const_cast( &colptr), + const_cast( &col_ptr), new_row._row_pos, nullptr ); - } - - - // StringRef ref = block->mutable_columns()[cid]->get_data_at(_row_pos); - // bool is_null = block->mutable_columns()[cid]->is_null_at(_row_pos); - // NullState null_state = is_null ? NullState::IS_NULL : NullState::NOT_NULL; - // return RowCursorCell(ref.data, null_state); - - // auto dst_cell = row_in_skiplist.cell(&_input_mutable_block, cid); - // auto src_cell = new_row.cell(&_input_mutable_block, cid); - // _schema->column(cid)->agg_update(&dst_cell, &src_cell, _table_mem_pool.get()); - - // if (_tablet_schema->has_sequence_col()) { - // auto sequence_idx = _tablet_schema->sequence_col_idx(); - // auto seq_dst_cell = row_in_skiplist.cell(&_input_mutable_block, sequence_idx); - // auto seq_src_cell = new_row.cell(&_input_mutable_block, sequence_idx); - // auto res = _schema->column(sequence_idx)->compare_cell(seq_dst_cell, seq_src_cell); - // // dst sequence column larger than src, don't need to update - // if (res > 0) { - // return; - // } - // } - // //dst is non-sequence row, or dst sequence is smaller - // for (uint32_t cid = _schema->num_key_columns(); - // cid < _schema->num_columns(); - // ++cid) { - // auto dst_cell = row_in_skiplist.cell(&_input_mutable_block, cid); - // auto src_cell = new_row.cell(&_input_mutable_block, cid); - // _schema->column(cid)->agg_update(&dst_cell, &src_cell, _table_mem_pool.get()); - // } + } } vectorized::Block MemTable::collect_skiplist_results() { VecTable::Iterator it(_vec_skip_list); vectorized::Block in_block = _input_mutable_block.to_block(); - for (it.SeekToFirst(); it.Valid(); it.Next()) { - _output_mutable_block.add_row(&in_block, it.key()._row_pos); + if (_keys_type == KeysType::DUP_KEYS){ + for (it.SeekToFirst(); it.Valid(); it.Next()) { + _output_mutable_block.add_row(&in_block, 
it.key()._row_pos); + } + }else{ + for (it.SeekToFirst(); it.Valid(); it.Next()) { + + auto& block_data = in_block.get_columns_with_type_and_name(); + //move key columns + for (size_t i = 0; i < _schema->num_key_columns(); ++i) { + _output_mutable_block.get_column_by_position(i)->insert_from(*block_data[i].column.get(), it.key()._row_pos); + } + //get value columns from agg_places + + for (size_t i = _schema->num_key_columns(); i < _schema->num_columns(); ++i) { + auto function = _agg_functions[i]; + function->insert_result_into(it.key()._agg_places[i] , *(_output_mutable_block.get_column_by_position(i))); + } + } } return _output_mutable_block.to_block(); } diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 7e5e93a9678592..f24e2d08464ba8 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -77,7 +77,19 @@ class MemTable { //row pos in _input_mutable_block struct RowInBlock{ size_t _row_pos; - RowInBlock(size_t i):_row_pos(i){} + std::vector _agg_places; + RowInBlock(size_t i):_row_pos(i) {} + void init_agg_places(std::vector& agg_functions, + int key_column_count){ + _agg_places.resize(agg_functions.size()); + for(int cid = key_column_count; cid < agg_functions.size(); cid++) + { + auto function = agg_functions[cid]; + _agg_places[cid] = new char[function->size_of_data()]; + function->create( _agg_places[cid] ); + } + } + RowCursorCell cell(vectorized::MutableBlock* block, int cid){ StringRef ref = block->mutable_columns()[cid]->get_data_at(_row_pos); bool is_null = block->mutable_columns()[cid]->is_null_at(_row_pos); From 6a8a82340171d01e0fbc1ca8e713ec4104c6ec66 Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Wed, 16 Mar 2022 17:38:04 +0800 Subject: [PATCH 20/32] skip init_agg for dup_keys --- be/src/olap/memtable.cpp | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 1da3265fd5c275..283e40498f7f20 100644 --- 
a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -51,7 +51,9 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet _vec_row_comparator = std::make_shared(_schema); _vec_skip_list = new VecTable(_vec_row_comparator.get(), _table_mem_pool.get(), _keys_type == KeysType::DUP_KEYS); - _init_agg_functions(); + if (_keys_type != KeysType::DUP_KEYS){ + _init_agg_functions(); + } }else{ _vec_skip_list =nullptr; if (tablet_schema->sort_type() == SortType::ZORDER) { @@ -277,6 +279,25 @@ vectorized::Block MemTable::collect_skiplist_results() return _output_mutable_block.to_block(); } +void dump(const vectorized::Block& block, int64_t tablet_id) { + std::ofstream out; + std::string file_name("/home/englefly/stream_load_test/dump.txt"); + file_name += std::to_string(tablet_id); + out.open(file_name); + for (size_t row_num = 0; row_num < block.rows(); ++row_num) { + for (size_t i = 0; i < block.columns(); ++i) { + if (block.get_by_position(i).column) { + out << block.get_by_position(i).to_string(row_num); + } + if (i != block.columns() - 1) { + out << ", "; + } + } + out << "\n"; + } + out.close(); +} + OLAPStatus MemTable::_vflush(){ VLOG_CRITICAL << "begin to flush memtable for tablet: " << _tablet_id << ", memsize: " << memory_usage() << ", rows: " << _rows; @@ -285,6 +306,7 @@ OLAPStatus MemTable::_vflush(){ { SCOPED_RAW_TIMER(&duration_ns); vectorized::Block block = collect_skiplist_results(); + dump(block, _tablet_id); OLAPStatus st = _rowset_writer->add_block(&block); RETURN_NOT_OK(st); _flush_size = block.allocated_bytes(); From 04dd19fbd08eeb9efd2831a4516da098bc903a85 Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Thu, 17 Mar 2022 14:08:45 +0800 Subject: [PATCH 21/32] =?UTF-8?q?=E4=BD=BF=E7=94=A8=20vec=20agg?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- be/src/olap/memtable.cpp | 10 +++++----- be/src/olap/memtable.h | 10 ++++++++-- 2 files changed, 13 insertions(+), 7 
deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 283e40498f7f20..5344fe8132295e 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -51,9 +51,6 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet _vec_row_comparator = std::make_shared(_schema); _vec_skip_list = new VecTable(_vec_row_comparator.get(), _table_mem_pool.get(), _keys_type == KeysType::DUP_KEYS); - if (_keys_type != KeysType::DUP_KEYS){ - _init_agg_functions(); - } }else{ _vec_skip_list =nullptr; if (tablet_schema->sort_type() == SortType::ZORDER) { @@ -67,7 +64,7 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet } } -void MemTable::_init_agg_functions() +void MemTable::_init_agg_functions(const vectorized::Block* block) { for (uint32_t cid = _schema->num_key_columns(); @@ -84,7 +81,7 @@ void MemTable::_init_agg_functions() // create aggregate function vectorized::DataTypes argument_types; - vectorized::DataTypePtr dtptr = Schema::get_data_type_ptr(_schema->column(cid)->type()); + vectorized::DataTypePtr dtptr = block->get_data_type(cid);//Schema::get_data_type_ptr(_schema->column(cid)->type()); argument_types.push_back(dtptr); vectorized::Array params; vectorized::AggregateFunctionPtr function = vectorized::AggregateFunctionSimpleFactory::instance().get( @@ -125,6 +122,9 @@ void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num _input_mutable_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); _vec_row_comparator->set_block(&_input_mutable_block); _output_mutable_block = vectorized::MutableBlock::build_mutable_block(&cloneBlock); + if (_keys_type != KeysType::DUP_KEYS){ + _init_agg_functions(block); + } } size_t cursor_in_mutableblock = _input_mutable_block.rows(); size_t oldsize = block->allocated_bytes(); diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index f24e2d08464ba8..fbf8015b60aee1 100644 --- a/be/src/olap/memtable.h +++ 
b/be/src/olap/memtable.h @@ -85,7 +85,8 @@ class MemTable { for(int cid = key_column_count; cid < agg_functions.size(); cid++) { auto function = agg_functions[cid]; - _agg_places[cid] = new char[function->size_of_data()]; + size_t place_size = function->size_of_data(); + _agg_places[cid] = new char[place_size]; function->create( _agg_places[cid] ); } } @@ -96,6 +97,11 @@ class MemTable { NullState null_state = is_null ? NullState::IS_NULL : NullState::NOT_NULL; return RowCursorCell(ref.data, null_state); } + ~ RowInBlock(){ + for( auto place: _agg_places){ + delete place; + } + } }; class RowInBlockComparator { public: @@ -188,7 +194,7 @@ class MemTable { vectorized::Block collect_skiplist_results(); bool _is_first_insertion; - void _init_agg_functions(); + void _init_agg_functions(const vectorized::Block* block); std::vector _agg_functions; }; // class MemTable From f6bfc5405cda4cce35b4a25387608c8f0d1461d4 Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Thu, 17 Mar 2022 20:46:14 +0800 Subject: [PATCH 22/32] =?UTF-8?q?RowInBlock=20=E4=BD=BF=E7=94=A8=E6=8C=87?= =?UTF-8?q?=E9=92=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- be/src/olap/memtable.cpp | 49 +++++++++++++++---------- be/src/olap/memtable.h | 24 +++++++----- be/src/olap/rowset/rowset_writer.h | 2 - be/src/vec/exec/vbroker_scan_node.cpp | 2 +- be/src/vec/runtime/vtablets_channel.cpp | 1 - be/src/vec/sink/vtablet_sink.cpp | 2 +- 6 files changed, 45 insertions(+), 35 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 5344fe8132295e..95bfc75b7cd58b 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -97,6 +97,12 @@ MemTable::~MemTable() { delete _skip_list; if (_vec_skip_list) delete _vec_skip_list; + for(auto row: rowInBlocks) + { + if (row != nullptr){ + delete row; + } + } } MemTable::RowCursorComparator::RowCursorComparator(const Schema* schema) : _schema(schema) {} @@ -107,8 +113,8 @@ int 
MemTable::RowCursorComparator::operator()(const char* left, const char* righ return compare_row(lhs_row, rhs_row); } -int MemTable::RowInBlockComparator::operator()(const RowInBlock left, const RowInBlock right) const{ - return _pblock->compare_at(left._row_pos, right._row_pos, +int MemTable::RowInBlockComparator::operator()(const RowInBlock* left, const RowInBlock* right) const{ + return _pblock->compare_at(left->_row_pos, right->_row_pos, _schema->num_key_columns(), *_pblock, -1); } @@ -130,39 +136,41 @@ void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num size_t oldsize = block->allocated_bytes(); _input_mutable_block.add_rows(block, row_pos, num_rows); size_t newsize = block->allocated_bytes(); - _mem_tracker->Consume(newsize - oldsize); + _mem_tracker->consume(newsize - oldsize); for(int i = 0; i < num_rows; i++){ - insert_one_row_from_block(RowInBlock(cursor_in_mutableblock + i)); + RowInBlock* row_in_block_ptr = new RowInBlock(cursor_in_mutableblock + i); + rowInBlocks.push_back(row_in_block_ptr); + insert_one_row_from_block(row_in_block_ptr); } } -void MemTable::insert_one_row_from_block(struct RowInBlock row_in_block) +void MemTable::insert_one_row_from_block(RowInBlock* row_in_block_ptr) { _rows++; bool overwritten = false; if (_keys_type == KeysType::DUP_KEYS) { - _vec_skip_list->Insert(row_in_block, &overwritten); + _vec_skip_list->Insert(row_in_block_ptr, &overwritten); DCHECK(!overwritten) << "Duplicate key model meet overwrite in SkipList"; return; } - bool is_exist = _vec_skip_list->Find(row_in_block, &_vec_hint); + bool is_exist = _vec_skip_list->Find(row_in_block_ptr, &_vec_hint); if (is_exist){ - _aggregate_two_rowInBlock(row_in_block, _vec_hint.curr->key); + _aggregate_two_rowInBlock(row_in_block_ptr, _vec_hint.curr->key); }else{ - row_in_block.init_agg_places(_agg_functions, _schema->num_key_columns()); + row_in_block_ptr->init_agg_places(_agg_functions, _schema->num_key_columns()); for ( auto cid = 
_schema->num_key_columns(); cid < _schema->num_columns(); cid++){ auto col_ptr = _input_mutable_block.mutable_columns()[cid].get(); - auto place = row_in_block._agg_places[cid]; + auto place = row_in_block_ptr->_agg_places[cid]; _agg_functions[cid]->add(place, const_cast( &col_ptr), - row_in_block._row_pos, + row_in_block_ptr->_row_pos, nullptr ); } - _vec_skip_list->InsertWithHint(row_in_block, is_exist, &_vec_hint); + _vec_skip_list->InsertWithHint(row_in_block_ptr, is_exist, &_vec_hint); } } @@ -225,11 +233,11 @@ void MemTable::_aggregate_two_row(const ContiguousRow& src_row, TableKey row_in_ } } -void MemTable::_aggregate_two_rowInBlock(RowInBlock new_row, RowInBlock row_in_skiplist){ +void MemTable::_aggregate_two_rowInBlock(RowInBlock* new_row, RowInBlock* row_in_skiplist){ if (_tablet_schema->has_sequence_col()) { auto sequence_idx = _tablet_schema->sequence_col_idx(); - auto res = _input_mutable_block.compare_at(row_in_skiplist._row_pos, new_row._row_pos, sequence_idx, _input_mutable_block, -1); + auto res = _input_mutable_block.compare_at(row_in_skiplist->_row_pos, new_row->_row_pos, sequence_idx, _input_mutable_block, -1); // dst sequence column larger than src, don't need to update if (res > 0){ return ; @@ -240,13 +248,13 @@ void MemTable::_aggregate_two_rowInBlock(RowInBlock new_row, RowInBlock row_in_s cid < _schema->num_columns(); ++cid) { - auto place = row_in_skiplist._agg_places[cid]; + auto place = row_in_skiplist->_agg_places[cid]; auto col_ptr = _input_mutable_block.mutable_columns()[cid].get(); _agg_functions[cid]->add(place, const_cast( &col_ptr), - new_row._row_pos, + new_row->_row_pos, nullptr ); } @@ -258,7 +266,7 @@ vectorized::Block MemTable::collect_skiplist_results() vectorized::Block in_block = _input_mutable_block.to_block(); if (_keys_type == KeysType::DUP_KEYS){ for (it.SeekToFirst(); it.Valid(); it.Next()) { - _output_mutable_block.add_row(&in_block, it.key()._row_pos); + _output_mutable_block.add_row(&in_block, 
it.key()->_row_pos); } }else{ for (it.SeekToFirst(); it.Valid(); it.Next()) { @@ -266,13 +274,14 @@ vectorized::Block MemTable::collect_skiplist_results() auto& block_data = in_block.get_columns_with_type_and_name(); //move key columns for (size_t i = 0; i < _schema->num_key_columns(); ++i) { - _output_mutable_block.get_column_by_position(i)->insert_from(*block_data[i].column.get(), it.key()._row_pos); + _output_mutable_block.get_column_by_position(i)->insert_from(*block_data[i].column.get(), it.key()->_row_pos); } //get value columns from agg_places for (size_t i = _schema->num_key_columns(); i < _schema->num_columns(); ++i) { auto function = _agg_functions[i]; - function->insert_result_into(it.key()._agg_places[i] , *(_output_mutable_block.get_column_by_position(i))); + function->insert_result_into(it.key()->_agg_places[i] , *(_output_mutable_block.get_column_by_position(i))); + function->destroy(it.key()->_agg_places[i]); } } } @@ -306,10 +315,10 @@ OLAPStatus MemTable::_vflush(){ { SCOPED_RAW_TIMER(&duration_ns); vectorized::Block block = collect_skiplist_results(); - dump(block, _tablet_id); OLAPStatus st = _rowset_writer->add_block(&block); RETURN_NOT_OK(st); _flush_size = block.allocated_bytes(); + _rowset_writer->flush(); } DorisMetrics::instance()->memtable_flush_total->increment(1); DorisMetrics::instance()->memtable_flush_duration_us->increment(duration_ns / 1000); diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index fbf8015b60aee1..e48bf58314cde0 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -82,12 +82,16 @@ class MemTable { void init_agg_places(std::vector& agg_functions, int key_column_count){ _agg_places.resize(agg_functions.size()); - for(int cid = key_column_count; cid < agg_functions.size(); cid++) + for(int cid = 0; cid < agg_functions.size(); cid++) { - auto function = agg_functions[cid]; - size_t place_size = function->size_of_data(); - _agg_places[cid] = new char[place_size]; - function->create( 
_agg_places[cid] ); + if (cid < key_column_count) { + _agg_places[cid] = nullptr; + }else{ + auto function = agg_functions[cid]; + size_t place_size = function->size_of_data(); + _agg_places[cid] = new char[place_size]; + function->create(_agg_places[cid]); + } } } @@ -109,7 +113,7 @@ class MemTable { //call set_block before operator(). //在第一次insert block时创建的 _input_mutable_block, 所以无法在Comparator的构造函数中获得pblock void set_block(vectorized::MutableBlock* pblock){_pblock = pblock;} - int operator()(const RowInBlock left, const RowInBlock right) const; + int operator()(const RowInBlock* left, const RowInBlock* right) const; private: const Schema* _schema; vectorized::MutableBlock* _pblock;// 对应Memtable::_input_mutable_block @@ -119,7 +123,7 @@ class MemTable { typedef SkipList Table; typedef Table::key_type TableKey; - typedef SkipList VecTable; + typedef SkipList VecTable; public: /// The iterator of memtable, so that the data in this memtable @@ -144,8 +148,8 @@ class MemTable { void _tuple_to_row(const Tuple* tuple, ContiguousRow* row, MemPool* mem_pool); void _aggregate_two_row(const ContiguousRow& new_row, TableKey row_in_skiplist); //for vectorized - void insert_one_row_from_block(struct RowInBlock row_in_block); - void _aggregate_two_rowInBlock(RowInBlock new_row, RowInBlock row_in_skiplist); + void insert_one_row_from_block(RowInBlock* row_in_block); + void _aggregate_two_rowInBlock(RowInBlock* new_row, RowInBlock* row_in_skiplist); int64_t _tablet_id; Schema* _schema; @@ -196,7 +200,7 @@ class MemTable { void _init_agg_functions(const vectorized::Block* block); std::vector _agg_functions; - + std::vector rowInBlocks; }; // class MemTable diff --git a/be/src/olap/rowset/rowset_writer.h b/be/src/olap/rowset/rowset_writer.h index fcbbdd0e2d18a6..fdbc2d7bf3aafe 100644 --- a/be/src/olap/rowset/rowset_writer.h +++ b/be/src/olap/rowset/rowset_writer.h @@ -58,8 +58,6 @@ class RowsetWriter { return OLAP_ERR_FUNC_NOT_IMPLEMENTED; } - virtual OLAPStatus add_block(const 
vectorized::Block* block) { - return OLAP_ERR_FUNC_NOT_IMPLEMENTED; } // finish building and return pointer to the built rowset (guaranteed to be inited). // return nullptr when failed virtual RowsetSharedPtr build() = 0; diff --git a/be/src/vec/exec/vbroker_scan_node.cpp b/be/src/vec/exec/vbroker_scan_node.cpp index bc20bf05e8c78a..9cb919228d0043 100644 --- a/be/src/vec/exec/vbroker_scan_node.cpp +++ b/be/src/vec/exec/vbroker_scan_node.cpp @@ -168,7 +168,7 @@ Status VBrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, // 1. too many batches in queue, or // 2. at least one batch in queue and memory exceed limit. (_block_queue.size() >= _max_buffered_batches || - (mem_tracker()->AnyLimitExceeded(MemLimit::HARD) && !_block_queue.empty()))) { + (mem_tracker()->any_limit_exceeded() && !_block_queue.empty()))) { _queue_writer_cond.wait_for(l, std::chrono::seconds(1)); } // Process already set failed, so we just return OK diff --git a/be/src/vec/runtime/vtablets_channel.cpp b/be/src/vec/runtime/vtablets_channel.cpp index 1840b0d2be18d7..228f8f6f55f96b 100644 --- a/be/src/vec/runtime/vtablets_channel.cpp +++ b/be/src/vec/runtime/vtablets_channel.cpp @@ -56,7 +56,6 @@ Status VTabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& reques wrequest.txn_id = _txn_id; wrequest.partition_id = tablet.partition_id(); wrequest.load_id = request.id(); - wrequest.need_gen_rollup = request.need_gen_rollup(); wrequest.slots = index_slots; wrequest.is_high_priority = _is_high_priority; diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp index a3ace9da8abdf8..b27d76a2523f5f 100644 --- a/be/src/vec/sink/vtablet_sink.cpp +++ b/be/src/vec/sink/vtablet_sink.cpp @@ -137,7 +137,7 @@ Status VNodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { // But there is still some unfinished things, we do mem limit here temporarily. 
// _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). - while (!_cancelled && _parent->_mem_tracker->AnyLimitExceeded(MemLimit::HARD) && + while (!_cancelled && _parent->_mem_tracker->any_limit_exceeded() && _pending_batches_num > 0) { SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); SleepFor(MonoDelta::FromMilliseconds(10)); From e795b10495c4f485b1589434097d94f663be5796 Mon Sep 17 00:00:00 2001 From: starocean999 <12095047@qq.com> Date: Fri, 11 Mar 2022 10:50:13 +0800 Subject: [PATCH 23/32] add vec compaction --- be/src/olap/compaction.cpp | 2 +- be/src/olap/merger.cpp | 49 ++ be/src/olap/merger.h | 4 + be/src/olap/reader.cpp | 11 + be/src/olap/rowset/beta_rowset_writer.cpp | 64 ++ be/src/olap/rowset/beta_rowset_writer.h | 2 + be/src/olap/rowset/rowset_writer.h | 5 + .../olap/rowset/segment_v2/column_writer.cpp | 21 + be/src/olap/rowset/segment_v2/column_writer.h | 6 + .../olap/rowset/segment_v2/segment_writer.cpp | 81 +- .../olap/rowset/segment_v2/segment_writer.h | 13 + be/src/vec/CMakeLists.txt | 1 + be/src/vec/olap/olap_data_convertor.cpp | 720 ++++++++++++++++++ be/src/vec/olap/olap_data_convertor.h | 256 +++++++ be/src/vec/olap/vcollect_iterator.cpp | 46 +- be/src/vec/olap/vcollect_iterator.h | 5 + 16 files changed, 1282 insertions(+), 4 deletions(-) create mode 100644 be/src/vec/olap/olap_data_convertor.cpp create mode 100644 be/src/vec/olap/olap_data_convertor.h diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index bb486d5e49ba1c..b4e365d38eaf90 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -87,7 +87,7 @@ OLAPStatus Compaction::do_compaction_impl(int64_t permits) { // 2. 
write merged rows to output rowset // The test results show that merger is low-memory-footprint, there is no need to tracker its mem pool Merger::Statistics stats; - auto res = Merger::merge_rowsets(_tablet, compaction_type(), _input_rs_readers, + auto res = Merger::vmerge_rowsets(_tablet, compaction_type(), _input_rs_readers, _output_rs_writer.get(), &stats); if (res != OLAP_SUCCESS) { LOG(WARNING) << "fail to do " << compaction_name() << ". res=" << res diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index 66dfaaf5974a79..67019334b7b1a5 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -22,6 +22,7 @@ #include "olap/olap_define.h" #include "olap/tuple_reader.h" +#include "vec/olap/block_reader.h" #include "olap/row_cursor.h" #include "olap/tablet.h" #include "util/trace.h" @@ -87,4 +88,52 @@ OLAPStatus Merger::merge_rowsets(TabletSharedPtr tablet, ReaderType reader_type, return OLAP_SUCCESS; } +OLAPStatus Merger::vmerge_rowsets(TabletSharedPtr tablet, ReaderType reader_type, + const std::vector& src_rowset_readers, + RowsetWriter* dst_rowset_writer, Statistics* stats_output) { + TRACE_COUNTER_SCOPE_LATENCY_US("merge_rowsets_latency_us"); + + vectorized::BlockReader reader; + TabletReader::ReaderParams reader_params; + reader_params.tablet = tablet; + reader_params.reader_type = reader_type; + reader_params.rs_readers = src_rowset_readers; + reader_params.version = dst_rowset_writer->version(); + + const auto& schema = tablet->tablet_schema(); + reader_params.return_columns.resize(schema.num_columns()); + std::iota(reader_params.return_columns.begin(), reader_params.return_columns.end(), 0); + reader_params.origin_return_columns = &reader_params.return_columns; + RETURN_NOT_OK(reader.init(reader_params)); + + vectorized::Block block = schema.create_block(reader_params.return_columns); + size_t output_rows = 0; + while (true) { + bool eof = false; + // Read one block from block reader + RETURN_NOT_OK_LOG( + 
reader.next_block_with_aggregation(&block, nullptr, nullptr, &eof), + "failed to read next block when merging rowsets of tablet " + tablet->full_name()); + if (eof) { + break; + } + RETURN_NOT_OK_LOG( + dst_rowset_writer->add_block(&block), + "failed to write block when merging rowsets of tablet " + tablet->full_name()); + output_rows += block.rows(); + block.clear_column_data(); + } + + if (stats_output != nullptr) { + stats_output->output_rows = output_rows; + stats_output->merged_rows = reader.merged_rows(); + stats_output->filtered_rows = reader.filtered_rows(); + } + + RETURN_NOT_OK_LOG( + dst_rowset_writer->flush(), + "failed to flush rowset when merging rowsets of tablet " + tablet->full_name()); + return OLAP_SUCCESS; +} + } // namespace doris diff --git a/be/src/olap/merger.h b/be/src/olap/merger.h index beef4fe9910850..35edf223eed6c6 100644 --- a/be/src/olap/merger.h +++ b/be/src/olap/merger.h @@ -39,6 +39,10 @@ class Merger { static OLAPStatus merge_rowsets(TabletSharedPtr tablet, ReaderType reader_type, const std::vector& src_rowset_readers, RowsetWriter* dst_rowset_writer, Statistics* stats_output); + + static OLAPStatus vmerge_rowsets(TabletSharedPtr tablet, ReaderType reader_type, + const std::vector& src_rowset_readers, + RowsetWriter* dst_rowset_writer, Statistics* stats_output); }; } // namespace doris diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp index 07927ba10e4149..bdb4fee6c90129 100644 --- a/be/src/olap/reader.cpp +++ b/be/src/olap/reader.cpp @@ -309,6 +309,17 @@ OLAPStatus TabletReader::_init_return_columns(const ReaderParams& read_params) { } } VLOG_NOTICE << "return column is empty, using full column as default."; + } else if ((read_params.reader_type == READER_CUMULATIVE_COMPACTION || + read_params.reader_type == READER_BASE_COMPACTION) && + !read_params.return_columns.empty()) { + _return_columns = read_params.return_columns; + for (auto id : read_params.return_columns) { + if (_tablet->tablet_schema().column(id).is_key()) 
{ + _key_cids.push_back(id); + } else { + _value_cids.push_back(id); + } + } } else if (read_params.reader_type == READER_CHECKSUM) { _return_columns = read_params.return_columns; for (auto id : read_params.return_columns) { diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index ae4ce2cb989708..931379ebb5541d 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -88,6 +88,70 @@ OLAPStatus BetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_conte return OLAP_SUCCESS; } +OLAPStatus BetaRowsetWriter::add_block(const vectorized::Block* block) { + if (PREDICT_FALSE(_segment_writer == nullptr)) { + RETURN_NOT_OK(_create_segment_writer(&_segment_writer)); + } + size_t block_size_in_bytes = block->bytes(); + size_t block_row_num = block->rows(); + size_t row_avg_size_in_bytes = std::max((size_t)1, block_size_in_bytes / block_row_num); + size_t row_offset = 0; + int64_t segment_capacity_in_bytes = 0; + int64_t segment_capacity_in_rows = 0; + auto refresh_segment_capacity = [&]() { + segment_capacity_in_bytes = + (int64_t)MAX_SEGMENT_SIZE - (int64_t)_segment_writer->estimate_segment_size(); + segment_capacity_in_rows = (int64_t)_context.max_rows_per_segment - + (int64_t)_segment_writer->num_rows_written(); + }; + + refresh_segment_capacity(); + if (PREDICT_FALSE(segment_capacity_in_bytes < row_avg_size_in_bytes || + segment_capacity_in_rows <= 0)) { + // no space for another signle row, need flush now + RETURN_NOT_OK(_flush_segment_writer(&_segment_writer)); + refresh_segment_capacity(); + } + + assert(segment_capacity_in_bytes > row_avg_size_in_bytes && segment_capacity_in_rows > 0); + if (block_size_in_bytes > segment_capacity_in_bytes || + block_row_num > segment_capacity_in_rows) { + size_t segment_max_row_num; + size_t input_row_num; + do { + assert(row_offset < block_row_num); + segment_max_row_num = + std::min((size_t)segment_capacity_in_bytes / 
row_avg_size_in_bytes, + (size_t)segment_capacity_in_rows); + input_row_num = std::min(segment_max_row_num, block_row_num - row_offset); + assert(input_row_num > 0); + auto s = _segment_writer->append_block(block, row_offset, input_row_num); + if (PREDICT_FALSE(!s.ok())) { + LOG(WARNING) << "failed to append block: " << s.to_string(); + return OLAP_ERR_WRITER_DATA_WRITE_ERROR; + } + + refresh_segment_capacity(); + if (segment_capacity_in_bytes < row_avg_size_in_bytes || + segment_capacity_in_rows <= 0) { + RETURN_NOT_OK(_flush_segment_writer(&_segment_writer)); + refresh_segment_capacity(); + } + row_offset += input_row_num; + _num_rows_written += input_row_num; + } while (row_offset < block_row_num); + } else { + auto s = _segment_writer->append_block(block, 0, block_row_num); + if (PREDICT_FALSE(!s.ok())) { + LOG(WARNING) << "failed to append block: " << s.to_string(); + return OLAP_ERR_WRITER_DATA_WRITE_ERROR; + } + refresh_segment_capacity(); + _num_rows_written += block_row_num; + } + return OLAP_SUCCESS; +} + template OLAPStatus BetaRowsetWriter::_add_row(const RowType& row) { if (PREDICT_FALSE(_segment_writer == nullptr)) { diff --git a/be/src/olap/rowset/beta_rowset_writer.h b/be/src/olap/rowset/beta_rowset_writer.h index 44c8917552c68e..eaf635af2a672e 100644 --- a/be/src/olap/rowset/beta_rowset_writer.h +++ b/be/src/olap/rowset/beta_rowset_writer.h @@ -43,6 +43,8 @@ class BetaRowsetWriter : public RowsetWriter { // For Memtable::flush() OLAPStatus add_row(const ContiguousRow& row) override { return _add_row(row); } + OLAPStatus add_block(const vectorized::Block* block) override; + // add rowset by create hard link OLAPStatus add_rowset(RowsetSharedPtr rowset) override; diff --git a/be/src/olap/rowset/rowset_writer.h b/be/src/olap/rowset/rowset_writer.h index fdbc2d7bf3aafe..12e99c812a9aa2 100644 --- a/be/src/olap/rowset/rowset_writer.h +++ b/be/src/olap/rowset/rowset_writer.h @@ -24,6 +24,7 @@ #include "olap/column_mapping.h" #include 
"olap/rowset/rowset.h" #include "olap/rowset/rowset_writer_context.h" +#include "vec/core/block.h" namespace doris { @@ -43,6 +44,10 @@ class RowsetWriter { virtual OLAPStatus add_row(const RowCursor& row) = 0; virtual OLAPStatus add_row(const ContiguousRow& row) = 0; + virtual OLAPStatus add_block(const vectorized::Block* block) { + return OLAP_ERR_FUNC_NOT_IMPLEMENTED; + } + // Precondition: the input `rowset` should have the same type of the rowset we're building virtual OLAPStatus add_rowset(RowsetSharedPtr rowset) = 0; diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 78327be905659f..00dbbe441dc530 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -191,6 +191,27 @@ Status ColumnWriter::append_nullable(const uint8_t* is_null_bits, const void* da return Status::OK(); } +Status ColumnWriter::append(const uint8_t* nullmap, const void* data, size_t num_rows) { + assert(data && num_rows > 0); + if (nullmap) { + size_t bitmap_size = BitmapSize(num_rows); + if (_null_bitmap.size() < bitmap_size) { + _null_bitmap.resize(bitmap_size); + } + uint8_t* bitmap_data = _null_bitmap.data(); + memset(bitmap_data, 0, bitmap_size); + for (size_t i = 0; i < num_rows; ++i) { + if (nullmap[i]) { + BitmapSet(bitmap_data, i); + } + } + return append_nullable(bitmap_data, data, num_rows); + } else { + const uint8_t* ptr = (const uint8_t*)data; + return append_data(&ptr, num_rows); + } +} + /////////////////////////////////////////////////////////////////////////////////// ScalarColumnWriter::ScalarColumnWriter(const ColumnWriterOptions& opts, diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 08eacbd66f5ebb..c689df080ee5bf 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -101,6 +101,8 @@ class ColumnWriter { return 
append_nullable(&nullmap, data, 1); } + Status append(const uint8_t* nullmap, const void* data, size_t num_rows); + Status append_nullable(const uint8_t* nullmap, const void* data, size_t num_rows); virtual Status append_nulls(size_t num_rows) = 0; @@ -141,6 +143,10 @@ class ColumnWriter { private: std::unique_ptr _field; bool _is_nullable; + std::vector _null_bitmap; + +protected: + std::shared_ptr _mem_tracker; }; class FlushPageCallback { diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 70509ad23439b6..168f6f1d723220 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -43,8 +43,15 @@ SegmentWriter::SegmentWriter(fs::WritableBlock* wblock, uint32_t segment_id, _opts(opts), _wblock(wblock), _mem_tracker( - MemTracker::create_virtual_tracker(-1, "SegmentWriter:Segment-" + std::to_string(segment_id))) { + MemTracker::create_virtual_tracker(-1, "SegmentWriter:Segment-" + std::to_string(segment_id))), + _olap_data_convertor(tablet_schema) { CHECK_NOTNULL(_wblock); + size_t num_short_key_column = _tablet_schema->num_short_key_columns(); + for (size_t cid = 0; cid < num_short_key_column; ++cid) { + const auto& column = _tablet_schema->column(cid); + _short_key_coders.push_back(get_key_coder(column.type())); + _short_key_index_size.push_back(column.index_length()); + } } SegmentWriter::~SegmentWriter() { @@ -99,6 +106,78 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec __attribute__((unused)) return Status::OK(); } +Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_pos, + size_t num_rows) { + assert(block && num_rows > 0 && row_pos + num_rows <= block->rows() && + block->columns() == _column_writers.size()); + _olap_data_convertor.set_source_content(block, row_pos, num_rows); + + // find all row pos for short key indexes + std::vector short_key_pos; + if (UNLIKELY(_short_key_row_pos == 0)) { + 
short_key_pos.push_back(0); + } + while (_short_key_row_pos + _opts.num_rows_per_block < _row_count + num_rows) { + _short_key_row_pos += _opts.num_rows_per_block; + short_key_pos.push_back(_short_key_row_pos - _row_count); + } + + // convert column data from engine format to storage layer format + std::vector short_key_columns; + size_t num_key_columns = _tablet_schema->num_short_key_columns(); + for (size_t cid = 0; cid < _column_writers.size(); ++cid) { + auto converted_result = _olap_data_convertor.convert_column_data(cid); + if (converted_result.first != Status::OK()) { + return converted_result.first; + } + if (cid < num_key_columns) { + short_key_columns.push_back(converted_result.second); + } + _column_writers[cid]->append(converted_result.second->get_nullmap(), + converted_result.second->get_data(), num_rows); + } + + // create short key indexes + std::vector key_column_fields; + for (const auto pos : short_key_pos) { + for (const auto& column : short_key_columns) { + key_column_fields.push_back(column->get_data_at(pos)); + } + std::string encoded_key = encode_short_keys(key_column_fields); + RETURN_IF_ERROR(_index_builder->add_item(encoded_key)); + key_column_fields.clear(); + } + + _row_count += num_rows; + _olap_data_convertor.clear_source_content(); + return Status::OK(); +} + +std::string SegmentWriter::encode_short_keys( + const std::vector key_column_fields, bool null_first) { + size_t num_key_columns = _tablet_schema->num_short_key_columns(); + assert(key_column_fields.size() == num_key_columns && + _short_key_coders.size() == num_key_columns && + _short_key_index_size.size() == num_key_columns); + + std::string encoded_keys; + for (size_t cid = 0; cid < num_key_columns; ++cid) { + const auto& field = key_column_fields[cid]; + if (field.null_flag) { + if (null_first) { + encoded_keys.push_back(KEY_NULL_FIRST_MARKER); + } else { + encoded_keys.push_back(KEY_NULL_LAST_MARKER); + } + continue; + } + encoded_keys.push_back(KEY_NORMAL_MARKER); + 
_short_key_coders[cid]->encode_ascending(field.value, _short_key_index_size[cid], + &encoded_keys); + } + return encoded_keys; +} + template Status SegmentWriter::append_row(const RowType& row) { for (size_t cid = 0; cid < _column_writers.size(); ++cid) { diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index 77a66c85db3640..68211e6c03c759 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -25,6 +25,8 @@ #include "common/status.h" // Status #include "gen_cpp/segment_v2.pb.h" #include "gutil/macros.h" +#include "vec/core/block.h" +#include "vec/olap/olap_data_convertor.h" namespace doris { @@ -34,6 +36,7 @@ class RowCursor; class TabletSchema; class TabletColumn; class ShortKeyIndexBuilder; +class KeyCoder; namespace fs { class WritableBlock; @@ -61,6 +64,8 @@ class SegmentWriter { template Status append_row(const RowType& row); + Status append_block(const vectorized::Block* block, size_t row_pos, size_t num_rows); + uint64_t estimate_segment_size(); uint32_t num_rows_written() { return _row_count; } @@ -80,6 +85,9 @@ class SegmentWriter { Status _write_footer(); Status _write_raw_data(const std::vector& slices); + std::string encode_short_keys(const std::vector key_column_fields, + bool null_first = true); + private: uint32_t _segment_id; const TabletSchema* _tablet_schema; @@ -93,6 +101,11 @@ class SegmentWriter { std::vector> _column_writers; std::shared_ptr _mem_tracker; uint32_t _row_count = 0; + + vectorized::OlapBlockDataConvertor _olap_data_convertor; + std::vector< const KeyCoder* > _short_key_coders; + std::vector< uint16_t > _short_key_index_size; + size_t _short_key_row_pos = 0; }; } // namespace segment_v2 diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index e614490f2647ff..f3c7a1c775b423 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -172,6 +172,7 @@ set(VEC_FILES 
olap/vcollect_iterator.cpp olap/block_reader.cpp olap/vdelta_writer.cpp + olap/olap_data_convertor.cpp sink/mysql_result_writer.cpp sink/result_sink.cpp sink/vdata_stream_sender.cpp diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp new file mode 100644 index 00000000000000..70e593151138a6 --- /dev/null +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -0,0 +1,720 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/olap/olap_data_convertor.h" + +#include "vec/columns/column_complex.h" +#include "vec/columns/column_vector.h" + +namespace doris::vectorized { + +// class OlapBlockDataConvertor +OlapBlockDataConvertor::OlapBlockDataConvertor(const TabletSchema* tablet_schema) { + assert(tablet_schema); + const auto& columns = tablet_schema->columns(); + for (const auto& col : columns) { + switch (col.type()) { + case FieldType::OLAP_FIELD_TYPE_OBJECT: { + m_convertors.emplace_back(std::make_shared()); + break; + } + case FieldType::OLAP_FIELD_TYPE_HLL: { + m_convertors.emplace_back(std::make_shared()); + break; + } + case FieldType::OLAP_FIELD_TYPE_CHAR: { + m_convertors.emplace_back(std::make_shared(col.length())); + break; + } + case FieldType::OLAP_FIELD_TYPE_MAP: + case FieldType::OLAP_FIELD_TYPE_VARCHAR: { + m_convertors.emplace_back(std::make_shared(false)); + break; + } + case FieldType::OLAP_FIELD_TYPE_STRING: { + m_convertors.emplace_back(std::make_shared(true)); + break; + } + case FieldType::OLAP_FIELD_TYPE_DATE: { + m_convertors.emplace_back(std::make_shared()); + break; + } + case FieldType::OLAP_FIELD_TYPE_DATETIME: { + m_convertors.emplace_back(std::make_shared()); + break; + } + case FieldType::OLAP_FIELD_TYPE_DECIMAL: { + m_convertors.emplace_back(std::make_shared()); + break; + } + case FieldType::OLAP_FIELD_TYPE_BOOL: { + m_convertors.emplace_back( + std::make_shared >()); + break; + } + case FieldType::OLAP_FIELD_TYPE_TINYINT: { + m_convertors.emplace_back( + std::make_shared >()); + break; + } + case FieldType::OLAP_FIELD_TYPE_SMALLINT: { + m_convertors.emplace_back( + std::make_shared >()); + break; + } + case FieldType::OLAP_FIELD_TYPE_INT: { + m_convertors.emplace_back( + std::make_shared >()); + break; + } + case FieldType::OLAP_FIELD_TYPE_BIGINT: { + m_convertors.emplace_back( + std::make_shared >()); + break; + } + case FieldType::OLAP_FIELD_TYPE_LARGEINT: { + m_convertors.emplace_back( + std::make_shared >()); + break; + } + case 
FieldType::OLAP_FIELD_TYPE_FLOAT: { + m_convertors.emplace_back( + std::make_shared >()); + break; + } + case FieldType::OLAP_FIELD_TYPE_DOUBLE: { + m_convertors.emplace_back( + std::make_shared >()); + break; + } + default: { + DCHECK(false) << "Invalid type in RowBlockV2:" << col.type(); + } + } + } +} + +void OlapBlockDataConvertor::set_source_content(const vectorized::Block* block, size_t row_pos, + size_t num_rows) { + assert(block && num_rows > 0 && row_pos + num_rows <= block->rows() && + block->columns() == m_convertors.size()); + size_t cid = 0; + for (const auto& typed_column : *block) { + m_convertors[cid]->set_source_column(typed_column, row_pos, num_rows); + ++cid; + } +} + +void OlapBlockDataConvertor::clear_source_content() { + for (auto& convertor : m_convertors) { + convertor->clear_source_column(); + } +} + +std::pair OlapBlockDataConvertor::convert_column_data( + size_t cid) { + assert(cid < m_convertors.size()); + auto status = m_convertors[cid]->convert_to_olap(); + return {status, m_convertors[cid]}; +} + +// class OlapBlockDataConvertor::OlapColumnDataConvertorBase +void OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column( + const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { + assert(num_rows > 0 && row_pos + num_rows <= typed_column.column->size()); + m_typed_column = typed_column; + m_row_pos = row_pos; + m_num_rows = num_rows; +} + +void OlapBlockDataConvertor::OlapColumnDataConvertorBase::clear_source_column() { + // just to reduce the source column's ref count to 1 + m_typed_column.column = nullptr; +} + +const UInt8* OlapBlockDataConvertor::OlapColumnDataConvertorBase::get_nullmap() const { + assert(m_typed_column.column); + const UInt8* nullmap = nullptr; + if (m_typed_column.column->is_nullable()) { + auto nullable_column = + assert_cast(m_typed_column.column.get()); + nullmap = nullable_column->get_null_map_data().data(); + } + return nullmap; +} + +// class 
OlapBlockDataConvertor::OlapColumnDataConvertorObject +void OlapBlockDataConvertor::OlapColumnDataConvertorObject::set_source_column( + const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { + OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, + num_rows); + m_raw_data.clear(); + m_slice.resize(num_rows); +} + +const void* OlapBlockDataConvertor::OlapColumnDataConvertorObject::get_data() const { + return m_slice.data(); +} + +OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorObject::get_data_at( + size_t offset) const { + assert(offset < m_num_rows && m_num_rows == m_slice.size()); + UInt8 null_flag = 0; + auto null_map = get_nullmap(); + if (null_map) { + null_flag = null_map[offset]; + } + return {null_flag, m_slice.data() + offset}; +} + +Status OlapBlockDataConvertor::OlapColumnDataConvertorObject::convert_to_olap() { + assert(m_typed_column.column); + const vectorized::ColumnBitmap* column_bitmap = nullptr; + const UInt8* nullmap = get_nullmap(); + if (nullmap) { + auto nullable_column = + assert_cast(m_typed_column.column.get()); + column_bitmap = assert_cast( + nullable_column->get_nested_column_ptr().get()); + } else { + column_bitmap = assert_cast(m_typed_column.column.get()); + } + + assert(column_bitmap); + BitmapValue* bitmap_value_cur = + const_cast(column_bitmap->get_data().data() + m_row_pos); + BitmapValue* bitmap_value_end = bitmap_value_cur + m_num_rows; + size_t slice_size; + size_t old_size; + char* raw_data; + Slice* slice = m_slice.data(); + if (nullmap) { + const UInt8* nullmap_cur = nullmap + m_row_pos; + while (bitmap_value_cur != bitmap_value_end) { + if (!*nullmap_cur) { + slice_size = bitmap_value_cur->getSizeInBytes(); + old_size = m_raw_data.size(); + m_raw_data.resize(old_size + slice_size); + + raw_data = m_raw_data.data() + old_size; + bitmap_value_cur->write(raw_data); + + slice->data = raw_data; + slice->size = slice_size; + } else { + // TODO: this may 
not be neccessary, check and remove later + slice->data = nullptr; + slice->size = 0; + } + ++slice; + ++nullmap_cur; + ++bitmap_value_cur; + } + assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && slice == m_slice.get_end_ptr()); + } else { + while (bitmap_value_cur != bitmap_value_end) { + slice_size = bitmap_value_cur->getSizeInBytes(); + old_size = m_raw_data.size(); + m_raw_data.resize(old_size + slice_size); + + raw_data = m_raw_data.data() + old_size; + bitmap_value_cur->write(raw_data); + + slice->data = raw_data; + slice->size = slice_size; + + ++slice; + ++bitmap_value_cur; + } + assert(slice == m_slice.get_end_ptr()); + } + return Status::OK(); +} + +// class OlapBlockDataConvertor::OlapColumnDataConvertorHLL +void OlapBlockDataConvertor::OlapColumnDataConvertorHLL::set_source_column( + const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { + OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, + num_rows); + m_raw_data.clear(); + m_slice.resize(num_rows); +} + +const void* OlapBlockDataConvertor::OlapColumnDataConvertorHLL::get_data() const { + return m_slice.data(); +} + +OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorHLL::get_data_at(size_t offset) const { + assert(offset < m_num_rows && m_num_rows == m_slice.size()); + UInt8 null_flag = 0; + auto null_map = get_nullmap(); + if (null_map) { + null_flag = null_map[offset]; + } + return {null_flag, m_slice.data() + offset}; +} + +Status OlapBlockDataConvertor::OlapColumnDataConvertorHLL::convert_to_olap() { + assert(m_typed_column.column); + const vectorized::ColumnHLL* column_hll = nullptr; + const UInt8* nullmap = get_nullmap(); + if (nullmap) { + auto nullable_column = + assert_cast(m_typed_column.column.get()); + column_hll = assert_cast( + nullable_column->get_nested_column_ptr().get()); + } else { + column_hll = assert_cast(m_typed_column.column.get()); + } + + assert(column_hll); + HyperLogLog* hll_value_cur = + 
const_cast(column_hll->get_data().data() + m_row_pos); + HyperLogLog* hll_value_end = hll_value_cur + m_num_rows; + size_t slice_size; + size_t old_size; + char* raw_data; + Slice* slice = m_slice.data(); + if (nullmap) { + const UInt8* nullmap_cur = nullmap + m_row_pos; + while (hll_value_cur != hll_value_end) { + if (!*nullmap_cur) { + slice_size = hll_value_cur->max_serialized_size(); + old_size = m_raw_data.size(); + m_raw_data.resize(old_size + slice_size); + + raw_data = m_raw_data.data() + old_size; + slice_size = hll_value_cur->serialize((uint8_t*)raw_data); + m_raw_data.resize(old_size + slice_size); + + slice->data = raw_data; + slice->size = slice_size; + } else { + // TODO: this may not be neccessary, check and remove later + slice->data = nullptr; + slice->size = 0; + } + ++slice; + ++nullmap_cur; + ++hll_value_cur; + } + assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && slice == m_slice.get_end_ptr()); + } else { + while (hll_value_cur != hll_value_end) { + slice_size = hll_value_cur->max_serialized_size(); + old_size = m_raw_data.size(); + m_raw_data.resize(old_size + slice_size); + + raw_data = m_raw_data.data() + old_size; + slice_size = hll_value_cur->serialize((uint8_t*)raw_data); + m_raw_data.resize(old_size + slice_size); + + slice->data = raw_data; + slice->size = slice_size; + + ++slice; + ++hll_value_cur; + } + assert(slice == m_slice.get_end_ptr()); + } + return Status::OK(); +} + +// class OlapBlockDataConvertor::OlapColumnDataConvertorChar +OlapBlockDataConvertor::OlapColumnDataConvertorChar::OlapColumnDataConvertorChar(size_t length) + : m_length(length) { + assert(length > 0); +} + +void OlapBlockDataConvertor::OlapColumnDataConvertorChar::set_source_column( + const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { + OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, + num_rows); + m_raw_data.resize(m_length * num_rows); + memset(m_raw_data.data(), 0, m_length * 
num_rows); + m_slice.resize(num_rows); +} + +const void* OlapBlockDataConvertor::OlapColumnDataConvertorChar::get_data() const { + return m_slice.data(); +} + +OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorChar::get_data_at( + size_t offset) const { + assert(offset < m_num_rows && m_num_rows == m_slice.size()); + UInt8 null_flag = 0; + auto null_map = get_nullmap(); + if (null_map) { + null_flag = null_map[offset]; + } + return {null_flag, m_slice.data() + offset}; +} + +Status OlapBlockDataConvertor::OlapColumnDataConvertorChar::convert_to_olap() { + assert(m_typed_column.column); + const vectorized::ColumnString* column_string = nullptr; + const UInt8* nullmap = get_nullmap(); + if (nullmap) { + auto nullable_column = + assert_cast(m_typed_column.column.get()); + column_string = assert_cast( + nullable_column->get_nested_column_ptr().get()); + } else { + column_string = assert_cast(m_typed_column.column.get()); + } + + assert(column_string); + + const ColumnString::Char* char_data = column_string->get_chars().data(); + const ColumnString::Offset* offset_cur = column_string->get_offsets().data() + m_row_pos; + const ColumnString::Offset* offset_end = offset_cur + m_num_rows; + char* raw_data = m_raw_data.data(); + Slice* slice = m_slice.data(); + size_t string_length; + size_t string_offset = *(offset_cur - 1); + size_t slice_size = m_length; + if (nullmap) { + const UInt8* nullmap_cur = nullmap + m_row_pos; + while (offset_cur != offset_end) { + if (!*nullmap_cur) { + string_length = *offset_cur - string_offset - 1; + assert(string_length <= slice_size); + memcpy(raw_data, char_data + string_offset, string_length); + + slice->data = raw_data; + slice->size = slice_size; + } else { + // TODO: this may not be neccessary, check and remove later + slice->data = nullptr; + slice->size = 0; + } + + string_offset = *offset_cur; + ++nullmap_cur; + ++slice; + ++offset_cur; + raw_data += slice_size; + } + assert(nullmap_cur == nullmap + m_row_pos + 
m_num_rows && slice == m_slice.get_end_ptr()); + } else { + while (offset_cur != offset_end) { + string_length = *offset_cur - string_offset - 1; + assert(string_length <= slice_size); + memcpy(raw_data, char_data + string_offset, string_length); + + slice->data = raw_data; + slice->size = slice_size; + + string_offset = *offset_cur; + ++slice; + ++offset_cur; + raw_data += slice_size; + } + assert(slice == m_slice.get_end_ptr()); + } + return Status::OK(); +} + +// class OlapBlockDataConvertor::OlapColumnDataConvertorVarChar +OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::OlapColumnDataConvertorVarChar( + bool check_length) + : m_check_length(check_length) {} + +void OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::set_source_column( + const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { + OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, + num_rows); + m_slice.resize(num_rows); +} + +const void* OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::get_data() const { + return m_slice.data(); +} + +OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::get_data_at( + size_t offset) const { + assert(offset < m_num_rows && m_num_rows == m_slice.size()); + UInt8 null_flag = 0; + auto null_map = get_nullmap(); + if (null_map) { + null_flag = null_map[offset]; + } + return {null_flag, m_slice.data() + offset}; +} + +Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap() { + assert(m_typed_column.column); + const vectorized::ColumnString* column_string = nullptr; + const UInt8* nullmap = get_nullmap(); + if (nullmap) { + auto nullable_column = + assert_cast(m_typed_column.column.get()); + column_string = assert_cast( + nullable_column->get_nested_column_ptr().get()); + } else { + column_string = assert_cast(m_typed_column.column.get()); + } + + assert(column_string); + + const char* char_data = (const char*)(column_string->get_chars().data()); 
+ const ColumnString::Offset* offset_cur = column_string->get_offsets().data() + m_row_pos; + const ColumnString::Offset* offset_end = offset_cur + m_num_rows; + + Slice* slice = m_slice.data(); + size_t string_offset = *(offset_cur - 1); + if (nullmap) { + const UInt8* nullmap_cur = nullmap + m_row_pos; + while (offset_cur != offset_end) { + if (!*nullmap_cur) { + slice->data = const_cast(char_data + string_offset); + slice->size = *offset_cur - string_offset - 1; + if (UNLIKELY(slice->size > MAX_SIZE_OF_VEC_STRING && m_check_length)) { + return Status::NotSupported( + "Not support string len over than 1MB in vec engine."); + } + } else { + // TODO: this may not be neccessary, check and remove later + slice->data = nullptr; + slice->size = 0; + } + string_offset = *offset_cur; + ++nullmap_cur; + ++slice; + ++offset_cur; + } + assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && slice == m_slice.get_end_ptr()); + } else { + while (offset_cur != offset_end) { + slice->data = const_cast(char_data + string_offset); + slice->size = *offset_cur - string_offset - 1; + if (UNLIKELY(slice->size > MAX_SIZE_OF_VEC_STRING && m_check_length)) { + return Status::NotSupported("Not support string len over than 1MB in vec engine."); + } + string_offset = *offset_cur; + ++slice; + ++offset_cur; + } + assert(slice == m_slice.get_end_ptr()); + } + return Status::OK(); +} + +// class OlapBlockDataConvertor::OlapColumnDataConvertorDate +void OlapBlockDataConvertor::OlapColumnDataConvertorDate::set_source_column( + const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { + OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, + num_rows); + m_values.resize(num_rows); +} + +const void* OlapBlockDataConvertor::OlapColumnDataConvertorDate::get_data() const { + return m_values.data(); +} + +OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorDate::get_data_at( + size_t offset) const { + assert(offset < m_num_rows && 
m_num_rows == m_values.size()); + UInt8 null_flag = 0; + auto null_map = get_nullmap(); + if (null_map) { + null_flag = null_map[offset]; + } + return {null_flag, m_values.data() + offset}; +} + +Status OlapBlockDataConvertor::OlapColumnDataConvertorDate::convert_to_olap() { + assert(m_typed_column.column); + const vectorized::ColumnVector* column_datetime = nullptr; + const UInt8* nullmap = get_nullmap(); + if (nullmap) { + auto nullable_column = + assert_cast(m_typed_column.column.get()); + column_datetime = assert_cast*>( + nullable_column->get_nested_column_ptr().get()); + } else { + column_datetime = assert_cast*>( + m_typed_column.column.get()); + } + + assert(column_datetime); + + const VecDateTimeValue* datetime_cur = + (const VecDateTimeValue*)(column_datetime->get_data().data()) + m_row_pos; + const VecDateTimeValue* datetime_end = datetime_cur + m_num_rows; + uint24_t* value = m_values.data(); + if (nullmap) { + const UInt8* nullmap_cur = nullmap + m_row_pos; + while (datetime_cur != datetime_end) { + if (!*nullmap_cur) { + *value = datetime_cur->to_olap_date(); + } else { + // do nothing + } + ++value; + ++datetime_cur; + ++nullmap_cur; + } + assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && value == m_values.get_end_ptr()); + } else { + while (datetime_cur != datetime_end) { + *value = datetime_cur->to_olap_date(); + ++value; + ++datetime_cur; + } + assert(value == m_values.get_end_ptr()); + } + return Status::OK(); +} + +// class OlapBlockDataConvertor::OlapColumnDataConvertorDateTime +void OlapBlockDataConvertor::OlapColumnDataConvertorDateTime::set_source_column( + const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { + OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, + num_rows); + m_values.resize(num_rows); +} + +const void* OlapBlockDataConvertor::OlapColumnDataConvertorDateTime::get_data() const { + return m_values.data(); +} + +OlapFieldData 
OlapBlockDataConvertor::OlapColumnDataConvertorDateTime::get_data_at( + size_t offset) const { + assert(offset < m_num_rows && m_num_rows == m_values.size()); + UInt8 null_flag = 0; + auto null_map = get_nullmap(); + if (null_map) { + null_flag = null_map[offset]; + } + return {null_flag, m_values.data() + offset}; +} + +Status OlapBlockDataConvertor::OlapColumnDataConvertorDateTime::convert_to_olap() { + assert(m_typed_column.column); + const vectorized::ColumnVector* column_datetime = nullptr; + const UInt8* nullmap = get_nullmap(); + if (nullmap) { + auto nullable_column = + assert_cast(m_typed_column.column.get()); + column_datetime = assert_cast*>( + nullable_column->get_nested_column_ptr().get()); + } else { + column_datetime = assert_cast*>( + m_typed_column.column.get()); + } + + assert(column_datetime); + + const VecDateTimeValue* datetime_cur = + (const VecDateTimeValue*)(column_datetime->get_data().data()) + m_row_pos; + const VecDateTimeValue* datetime_end = datetime_cur + m_num_rows; + uint64_t* value = m_values.data(); + if (nullmap) { + const UInt8* nullmap_cur = nullmap + m_row_pos; + while (datetime_cur != datetime_end) { + if (!*nullmap_cur) { + *value = datetime_cur->to_olap_datetime(); + } else { + // do nothing + } + ++value; + ++datetime_cur; + ++nullmap_cur; + } + assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && value == m_values.get_end_ptr()); + } else { + while (datetime_cur != datetime_end) { + *value = datetime_cur->to_olap_datetime(); + ++value; + ++datetime_cur; + } + assert(value == m_values.get_end_ptr()); + } + return Status::OK(); +} + +// class OlapBlockDataConvertor::OlapColumnDataConvertorDecimal +void OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::set_source_column( + const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { + OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, + num_rows); + m_values.resize(num_rows); +} + +const void* 
OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::get_data() const { + return m_values.data(); +} + +OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::get_data_at( + size_t offset) const { + assert(offset < m_num_rows && m_num_rows == m_values.size()); + UInt8 null_flag = 0; + auto null_map = get_nullmap(); + if (null_map) { + null_flag = null_map[offset]; + } + return {null_flag, m_values.data() + offset}; +} + +Status OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::convert_to_olap() { + assert(m_typed_column.column); + const vectorized::ColumnDecimal* column_decimal = nullptr; + const UInt8* nullmap = get_nullmap(); + if (nullmap) { + auto nullable_column = + assert_cast(m_typed_column.column.get()); + column_decimal = assert_cast*>( + nullable_column->get_nested_column_ptr().get()); + } else { + column_decimal = assert_cast*>( + m_typed_column.column.get()); + } + + assert(column_decimal); + + const DecimalV2Value* decimal_cur = + (const DecimalV2Value*)(column_decimal->get_data().data()) + m_row_pos; + const DecimalV2Value* decimal_end = decimal_cur + m_num_rows; + decimal12_t* value = m_values.data(); + if (nullmap) { + const UInt8* nullmap_cur = nullmap + m_row_pos; + while (decimal_cur != decimal_end) { + if (!*nullmap_cur) { + value->integer = decimal_cur->int_value(); + value->fraction = decimal_cur->frac_value(); + } else { + // do nothing + } + ++value; + ++decimal_cur; + ++nullmap_cur; + } + assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && value == m_values.get_end_ptr()); + } else { + while (decimal_cur != decimal_end) { + value->integer = decimal_cur->int_value(); + value->fraction = decimal_cur->frac_value(); + ++value; + ++decimal_cur; + } + assert(value == m_values.get_end_ptr()); + } + return Status::OK(); +} + +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h new file mode 100644 index 
00000000000000..002dfc419ebdfa --- /dev/null +++ b/be/src/vec/olap/olap_data_convertor.h @@ -0,0 +1,256 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once +#include "olap/tablet_schema.h" +#include "vec/core/block.h" + +namespace doris::vectorized { + +struct OlapFieldData { + UInt8 null_flag; + const void* value; +}; + +class IOlapColumnDataAccessor { +public: + virtual const UInt8* get_nullmap() const = 0; + virtual const void* get_data() const = 0; + virtual OlapFieldData get_data_at(size_t offset) const = 0; + virtual ~IOlapColumnDataAccessor() {} +}; +using IOlapColumnDataAccessorSPtr = std::shared_ptr; + +class OlapBlockDataConvertor { +public: + OlapBlockDataConvertor(const TabletSchema* tablet_schema); + void set_source_content(const vectorized::Block* block, size_t row_pos, size_t num_rows); + void clear_source_content(); + std::pair convert_column_data(size_t cid); + +private: + // accessors for different data types; + class OlapColumnDataConvertorBase : public IOlapColumnDataAccessor { + public: + OlapColumnDataConvertorBase() = default; + virtual ~OlapColumnDataConvertorBase() = default; + OlapColumnDataConvertorBase(const OlapColumnDataConvertorBase&) = delete; + 
OlapColumnDataConvertorBase& operator=(const OlapColumnDataConvertorBase&) = delete; + OlapColumnDataConvertorBase(OlapColumnDataConvertorBase&&) = delete; + OlapColumnDataConvertorBase& operator=(OlapColumnDataConvertorBase&&) = delete; + + virtual void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, + size_t num_rows); + void clear_source_column(); + const UInt8* get_nullmap() const override; + virtual Status convert_to_olap() = 0; + + protected: + ColumnWithTypeAndName m_typed_column; + size_t m_row_pos; + size_t m_num_rows; + }; + using OlapColumnDataConvertorBaseSPtr = std::shared_ptr; + + class OlapColumnDataConvertorObject : public OlapColumnDataConvertorBase { + public: + OlapColumnDataConvertorObject() = default; + ~OlapColumnDataConvertorObject() override = default; + + void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, + size_t num_rows) override; + const void* get_data() const override; + OlapFieldData get_data_at(size_t offset) const override; + Status convert_to_olap() override; + + private: + PaddedPODArray m_slice; + PaddedPODArray m_raw_data; + }; + + class OlapColumnDataConvertorHLL : public OlapColumnDataConvertorBase { + public: + OlapColumnDataConvertorHLL() = default; + ~OlapColumnDataConvertorHLL() override = default; + + void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, + size_t num_rows) override; + const void* get_data() const override; + OlapFieldData get_data_at(size_t offset) const override; + Status convert_to_olap() override; + + private: + PaddedPODArray m_slice; + PaddedPODArray m_raw_data; + }; + + class OlapColumnDataConvertorChar : public OlapColumnDataConvertorBase { + public: + OlapColumnDataConvertorChar(size_t length); + ~OlapColumnDataConvertorChar() override = default; + + void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, + size_t num_rows) override; + const void* get_data() const override; + 
OlapFieldData get_data_at(size_t offset) const override; + Status convert_to_olap() override; + + private: + size_t m_length; + PaddedPODArray m_slice; + PaddedPODArray m_raw_data; + }; + + class OlapColumnDataConvertorVarChar : public OlapColumnDataConvertorBase { + public: + OlapColumnDataConvertorVarChar(bool check_length); + ~OlapColumnDataConvertorVarChar() override = default; + + void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, + size_t num_rows) override; + const void* get_data() const override; + OlapFieldData get_data_at(size_t offset) const override; + Status convert_to_olap() override; + + private: + bool m_check_length; + PaddedPODArray m_slice; + }; + + class OlapColumnDataConvertorDate : public OlapColumnDataConvertorBase { + public: + OlapColumnDataConvertorDate() = default; + ~OlapColumnDataConvertorDate() override = default; + + void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, + size_t num_rows) override; + const void* get_data() const override; + OlapFieldData get_data_at(size_t offset) const override; + Status convert_to_olap() override; + + private: + PaddedPODArray m_values; + }; + + class OlapColumnDataConvertorDateTime : public OlapColumnDataConvertorBase { + public: + OlapColumnDataConvertorDateTime() = default; + ~OlapColumnDataConvertorDateTime() override = default; + + void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, + size_t num_rows) override; + const void* get_data() const override; + OlapFieldData get_data_at(size_t offset) const override; + Status convert_to_olap() override; + + private: + PaddedPODArray m_values; + }; + + class OlapColumnDataConvertorDecimal : public OlapColumnDataConvertorBase { + public: + OlapColumnDataConvertorDecimal() = default; + ~OlapColumnDataConvertorDecimal() override = default; + + void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, + size_t num_rows) override; + const void* 
get_data() const override; + OlapFieldData get_data_at(size_t offset) const override; + Status convert_to_olap() override; + + private: + PaddedPODArray m_values; + }; + + // class OlapColumnDataConvertorSimple for simple types, which don't need to do any convert, like int, float, double, etc... + template + class OlapColumnDataConvertorSimple : public OlapColumnDataConvertorBase { + public: + OlapColumnDataConvertorSimple() = default; + ~OlapColumnDataConvertorSimple() override = default; + + void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, + size_t num_rows) override { + OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column( + typed_column, row_pos, num_rows); + m_values.resize(num_rows); + } + + const void* get_data() const override { return m_values.data(); } + + OlapFieldData get_data_at(size_t offset) const override { + assert(offset < m_num_rows && m_num_rows == m_values.size()); + UInt8 null_flag = 0; + auto null_map = get_nullmap(); + if (null_map) { + null_flag = null_map[offset]; + } + return {null_flag, m_values.data() + offset}; + } + + Status convert_to_olap() override { + const vectorized::ColumnVector* column_data = nullptr; + const UInt8* nullmap = get_nullmap(); + if (nullmap) { + auto nullable_column = + assert_cast(m_typed_column.column.get()); + column_data = assert_cast*>( + nullable_column->get_nested_column_ptr().get()); + } else { + column_data = assert_cast*>( + m_typed_column.column.get()); + } + + assert(column_data); + + const T* data_cur = (const T*)(column_data->get_data().data()) + m_row_pos; + const T* data_end = data_cur + m_num_rows; + T* value = m_values.data(); + if (nullmap) { + const UInt8* nullmap_cur = nullmap + m_row_pos; + while (data_cur != data_end) { + if (!*nullmap_cur) { + *value = *data_cur; + } else { + // do nothing + } + ++value; + ++data_cur; + ++nullmap_cur; + } + assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && + value == m_values.get_end_ptr()); + } 
else { + while (data_cur != data_end) { + *value = *data_cur; + ++value; + ++data_cur; + } + assert(value == m_values.get_end_ptr()); + } + return Status::OK(); + } + + private: + PaddedPODArray m_values; + }; + +private: + std::vector m_convertors; +}; + +} // namespace doris::vectorized diff --git a/be/src/vec/olap/vcollect_iterator.cpp b/be/src/vec/olap/vcollect_iterator.cpp index 18d80225d423f6..c64df3b1d2439b 100644 --- a/be/src/vec/olap/vcollect_iterator.cpp +++ b/be/src/vec/olap/vcollect_iterator.cpp @@ -212,7 +212,13 @@ OLAPStatus VCollectIterator::Level0Iterator::next(IteratorRowRef* ref) { } OLAPStatus VCollectIterator::Level0Iterator::next(Block* block) { - return _rs_reader->next_block(block); + if (UNLIKELY(_ref.block->rows() > 0 && _ref.row_pos == 0)) { + block->swap(*_ref.block); + _ref.row_pos = -1; + return OLAP_SUCCESS; + } else { + return _rs_reader->next_block(block); + } } VCollectIterator::Level1Iterator::Level1Iterator( @@ -224,6 +230,7 @@ VCollectIterator::Level1Iterator::Level1Iterator( _merge(merge), _skip_same(skip_same) { _ref.row_pos = -1; // represent eof + _batch_size = reader->_batch_size; } VCollectIterator::Level1Iterator::~Level1Iterator() { @@ -261,7 +268,11 @@ OLAPStatus VCollectIterator::Level1Iterator::next(Block* block) { if (UNLIKELY(_cur_child == nullptr)) { return OLAP_ERR_DATA_EOF; } - return _normal_next(block); + if (_merge) { + return _merge_next(block); + } else { + return _normal_next(block); + } } int64_t VCollectIterator::Level1Iterator::version() const { @@ -362,6 +373,37 @@ OLAPStatus VCollectIterator::Level1Iterator::_normal_next(IteratorRowRef* ref) { } } +OLAPStatus VCollectIterator::Level1Iterator::_merge_next(Block* block) { + int target_block_row = 0; + auto target_columns = block->mutate_columns(); + size_t column_count = block->columns(); + IteratorRowRef cur_row = _ref; + do { + const auto& src_block = cur_row.block; + assert(src_block->columns() == column_count); + for (size_t i = 0; i < column_count; 
++i) { + target_columns[i]->insert_from(*(src_block->get_by_position(i).column), + cur_row.row_pos); + } + ++target_block_row; + auto res = _merge_next(&cur_row); + if (UNLIKELY(res == OLAP_ERR_DATA_EOF)) { + if (target_block_row > 0) { + return OLAP_SUCCESS; + } else { + return OLAP_ERR_DATA_EOF; + } + } + + if (UNLIKELY(res != OLAP_SUCCESS)) { + LOG(WARNING) << "next failed: " << res; + return res; + } + } while (target_block_row < _batch_size); + + return OLAP_SUCCESS; +} + OLAPStatus VCollectIterator::Level1Iterator::_normal_next(Block* block) { auto res = _cur_child->next(block); if (LIKELY(res == OLAP_SUCCESS)) { diff --git a/be/src/vec/olap/vcollect_iterator.h b/be/src/vec/olap/vcollect_iterator.h index 366d12658cfe47..261bd1a6f3f9b3 100644 --- a/be/src/vec/olap/vcollect_iterator.h +++ b/be/src/vec/olap/vcollect_iterator.h @@ -161,6 +161,8 @@ class VCollectIterator { inline OLAPStatus _normal_next(IteratorRowRef* ref); + inline OLAPStatus _merge_next(Block* block); + inline OLAPStatus _normal_next(Block* block); // Each LevelIterator corresponds to a rowset reader, @@ -181,6 +183,9 @@ class VCollectIterator { bool _skip_same; // used when `_merge == true` std::unique_ptr _heap; + + // batch size, get from TabletReader + int _batch_size; }; std::unique_ptr _inner_iter; From ae50b2691f72dae9614acf6580c5ec9e70baa430 Mon Sep 17 00:00:00 2001 From: starocean999 <12095047@qq.com> Date: Wed, 16 Mar 2022 15:39:25 +0800 Subject: [PATCH 24/32] modified based on pr comments --- be/src/olap/rowset/beta_rowset_writer.cpp | 14 +- .../olap/rowset/segment_v2/segment_writer.cpp | 10 +- .../olap/rowset/segment_v2/segment_writer.h | 2 +- be/src/vec/olap/olap_data_convertor.cpp | 371 +++++++++--------- be/src/vec/olap/olap_data_convertor.h | 110 ++---- 5 files changed, 229 insertions(+), 278 deletions(-) diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index 931379ebb5541d..c5ec157136d0c5 100644 --- 
a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -89,7 +89,7 @@ OLAPStatus BetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_conte } OLAPStatus BetaRowsetWriter::add_block(const vectorized::Block* block) { - if (PREDICT_FALSE(_segment_writer == nullptr)) { + if (UNLIKELY(_segment_writer == nullptr)) { RETURN_NOT_OK(_create_segment_writer(&_segment_writer)); } size_t block_size_in_bytes = block->bytes(); @@ -106,10 +106,11 @@ OLAPStatus BetaRowsetWriter::add_block(const vectorized::Block* block) { }; refresh_segment_capacity(); - if (PREDICT_FALSE(segment_capacity_in_bytes < row_avg_size_in_bytes || + if (UNLIKELY(segment_capacity_in_bytes < row_avg_size_in_bytes || segment_capacity_in_rows <= 0)) { // no space for another signle row, need flush now RETURN_NOT_OK(_flush_segment_writer(&_segment_writer)); + RETURN_NOT_OK(_create_segment_writer(&_segment_writer)); refresh_segment_capacity(); } @@ -126,15 +127,16 @@ OLAPStatus BetaRowsetWriter::add_block(const vectorized::Block* block) { input_row_num = std::min(segment_max_row_num, block_row_num - row_offset); assert(input_row_num > 0); auto s = _segment_writer->append_block(block, row_offset, input_row_num); - if (PREDICT_FALSE(!s.ok())) { + if (UNLIKELY(!s.ok())) { LOG(WARNING) << "failed to append block: " << s.to_string(); return OLAP_ERR_WRITER_DATA_WRITE_ERROR; } refresh_segment_capacity(); - if (segment_capacity_in_bytes < row_avg_size_in_bytes || - segment_capacity_in_rows <= 0) { + if (LIKELY(segment_capacity_in_bytes < row_avg_size_in_bytes || + segment_capacity_in_rows <= 0)) { RETURN_NOT_OK(_flush_segment_writer(&_segment_writer)); + RETURN_NOT_OK(_create_segment_writer(&_segment_writer)); refresh_segment_capacity(); } row_offset += input_row_num; @@ -142,7 +144,7 @@ OLAPStatus BetaRowsetWriter::add_block(const vectorized::Block* block) { } while (row_offset < block_row_num); } else { auto s = _segment_writer->append_block(block, 0, 
block_row_num); - if (PREDICT_FALSE(!s.ok())) { + if (UNLIKELY(!s.ok())) { LOG(WARNING) << "failed to append block: " << s.to_string(); return OLAP_ERR_WRITER_DATA_WRITE_ERROR; } diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 168f6f1d723220..8baf1c135aff29 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -138,7 +138,7 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po } // create short key indexes - std::vector key_column_fields; + std::vector key_column_fields; for (const auto pos : short_key_pos) { for (const auto& column : short_key_columns) { key_column_fields.push_back(column->get_data_at(pos)); @@ -154,7 +154,7 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po } std::string SegmentWriter::encode_short_keys( - const std::vector key_column_fields, bool null_first) { + const std::vector key_column_fields, bool null_first) { size_t num_key_columns = _tablet_schema->num_short_key_columns(); assert(key_column_fields.size() == num_key_columns && _short_key_coders.size() == num_key_columns && @@ -162,8 +162,8 @@ std::string SegmentWriter::encode_short_keys( std::string encoded_keys; for (size_t cid = 0; cid < num_key_columns; ++cid) { - const auto& field = key_column_fields[cid]; - if (field.null_flag) { + auto field = key_column_fields[cid]; + if (UNLIKELY(!field)) { if (null_first) { encoded_keys.push_back(KEY_NULL_FIRST_MARKER); } else { @@ -172,7 +172,7 @@ std::string SegmentWriter::encode_short_keys( continue; } encoded_keys.push_back(KEY_NORMAL_MARKER); - _short_key_coders[cid]->encode_ascending(field.value, _short_key_index_size[cid], + _short_key_coders[cid]->encode_ascending(field, _short_key_index_size[cid], &encoded_keys); } return encoded_keys; diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h 
b/be/src/olap/rowset/segment_v2/segment_writer.h index 68211e6c03c759..9be6e1f1e08d9b 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -85,7 +85,7 @@ class SegmentWriter { Status _write_footer(); Status _write_raw_data(const std::vector& slices); - std::string encode_short_keys(const std::vector key_column_fields, + std::string encode_short_keys(const std::vector key_column_fields, bool null_first = true); private: diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index 70e593151138a6..7fc103cdbd5793 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -29,75 +29,75 @@ OlapBlockDataConvertor::OlapBlockDataConvertor(const TabletSchema* tablet_schema for (const auto& col : columns) { switch (col.type()) { case FieldType::OLAP_FIELD_TYPE_OBJECT: { - m_convertors.emplace_back(std::make_shared()); + _convertors.emplace_back(std::make_shared()); break; } case FieldType::OLAP_FIELD_TYPE_HLL: { - m_convertors.emplace_back(std::make_shared()); + _convertors.emplace_back(std::make_shared()); break; } case FieldType::OLAP_FIELD_TYPE_CHAR: { - m_convertors.emplace_back(std::make_shared(col.length())); + _convertors.emplace_back(std::make_shared(col.length())); break; } case FieldType::OLAP_FIELD_TYPE_MAP: case FieldType::OLAP_FIELD_TYPE_VARCHAR: { - m_convertors.emplace_back(std::make_shared(false)); + _convertors.emplace_back(std::make_shared(false)); break; } case FieldType::OLAP_FIELD_TYPE_STRING: { - m_convertors.emplace_back(std::make_shared(true)); + _convertors.emplace_back(std::make_shared(true)); break; } case FieldType::OLAP_FIELD_TYPE_DATE: { - m_convertors.emplace_back(std::make_shared()); + _convertors.emplace_back(std::make_shared()); break; } case FieldType::OLAP_FIELD_TYPE_DATETIME: { - m_convertors.emplace_back(std::make_shared()); + _convertors.emplace_back(std::make_shared()); break; } case 
FieldType::OLAP_FIELD_TYPE_DECIMAL: { - m_convertors.emplace_back(std::make_shared()); + _convertors.emplace_back(std::make_shared()); break; } case FieldType::OLAP_FIELD_TYPE_BOOL: { - m_convertors.emplace_back( + _convertors.emplace_back( std::make_shared >()); break; } case FieldType::OLAP_FIELD_TYPE_TINYINT: { - m_convertors.emplace_back( + _convertors.emplace_back( std::make_shared >()); break; } case FieldType::OLAP_FIELD_TYPE_SMALLINT: { - m_convertors.emplace_back( + _convertors.emplace_back( std::make_shared >()); break; } case FieldType::OLAP_FIELD_TYPE_INT: { - m_convertors.emplace_back( + _convertors.emplace_back( std::make_shared >()); break; } case FieldType::OLAP_FIELD_TYPE_BIGINT: { - m_convertors.emplace_back( + _convertors.emplace_back( std::make_shared >()); break; } case FieldType::OLAP_FIELD_TYPE_LARGEINT: { - m_convertors.emplace_back( + _convertors.emplace_back( std::make_shared >()); break; } case FieldType::OLAP_FIELD_TYPE_FLOAT: { - m_convertors.emplace_back( + _convertors.emplace_back( std::make_shared >()); break; } case FieldType::OLAP_FIELD_TYPE_DOUBLE: { - m_convertors.emplace_back( + _convertors.emplace_back( std::make_shared >()); break; } @@ -111,50 +111,49 @@ OlapBlockDataConvertor::OlapBlockDataConvertor(const TabletSchema* tablet_schema void OlapBlockDataConvertor::set_source_content(const vectorized::Block* block, size_t row_pos, size_t num_rows) { assert(block && num_rows > 0 && row_pos + num_rows <= block->rows() && - block->columns() == m_convertors.size()); + block->columns() == _convertors.size()); size_t cid = 0; for (const auto& typed_column : *block) { - m_convertors[cid]->set_source_column(typed_column, row_pos, num_rows); + _convertors[cid]->set_source_column(typed_column, row_pos, num_rows); ++cid; } } void OlapBlockDataConvertor::clear_source_content() { - for (auto& convertor : m_convertors) { + for (auto& convertor : _convertors) { convertor->clear_source_column(); } } std::pair 
OlapBlockDataConvertor::convert_column_data( size_t cid) { - assert(cid < m_convertors.size()); - auto status = m_convertors[cid]->convert_to_olap(); - return {status, m_convertors[cid]}; + assert(cid < _convertors.size()); + auto status = _convertors[cid]->convert_to_olap(); + return {status, _convertors[cid]}; } // class OlapBlockDataConvertor::OlapColumnDataConvertorBase void OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column( const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { assert(num_rows > 0 && row_pos + num_rows <= typed_column.column->size()); - m_typed_column = typed_column; - m_row_pos = row_pos; - m_num_rows = num_rows; + _typed_column = typed_column; + _row_pos = row_pos; + _num_rows = num_rows; + if (_typed_column.column->is_nullable()) { + auto nullable_column = + assert_cast(_typed_column.column.get()); + _nullmap = nullable_column->get_null_map_data().data(); + } } void OlapBlockDataConvertor::OlapColumnDataConvertorBase::clear_source_column() { // just to reduce the source column's ref count to 1 - m_typed_column.column = nullptr; + _typed_column.column = nullptr; } const UInt8* OlapBlockDataConvertor::OlapColumnDataConvertorBase::get_nullmap() const { - assert(m_typed_column.column); - const UInt8* nullmap = nullptr; - if (m_typed_column.column->is_nullable()) { - auto nullable_column = - assert_cast(m_typed_column.column.get()); - nullmap = nullable_column->get_null_map_data().data(); - } - return nullmap; + assert(_typed_column.column); + return _nullmap; } // class OlapBlockDataConvertor::OlapColumnDataConvertorObject @@ -162,55 +161,53 @@ void OlapBlockDataConvertor::OlapColumnDataConvertorObject::set_source_column( const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, num_rows); - m_raw_data.clear(); - m_slice.resize(num_rows); + _raw_data.clear(); + _slice.resize(num_rows); } 
const void* OlapBlockDataConvertor::OlapColumnDataConvertorObject::get_data() const { - return m_slice.data(); + return _slice.data(); } -OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorObject::get_data_at( +const void* OlapBlockDataConvertor::OlapColumnDataConvertorObject::get_data_at( size_t offset) const { - assert(offset < m_num_rows && m_num_rows == m_slice.size()); + assert(offset < _num_rows && _num_rows == _slice.size()); UInt8 null_flag = 0; - auto null_map = get_nullmap(); - if (null_map) { - null_flag = null_map[offset]; + if (_nullmap) { + null_flag = _nullmap[offset]; } - return {null_flag, m_slice.data() + offset}; + return null_flag ? nullptr : _slice.data() + offset; } Status OlapBlockDataConvertor::OlapColumnDataConvertorObject::convert_to_olap() { - assert(m_typed_column.column); + assert(_typed_column.column); const vectorized::ColumnBitmap* column_bitmap = nullptr; - const UInt8* nullmap = get_nullmap(); - if (nullmap) { + if (_nullmap) { auto nullable_column = - assert_cast(m_typed_column.column.get()); + assert_cast(_typed_column.column.get()); column_bitmap = assert_cast( nullable_column->get_nested_column_ptr().get()); } else { - column_bitmap = assert_cast(m_typed_column.column.get()); + column_bitmap = assert_cast(_typed_column.column.get()); } assert(column_bitmap); BitmapValue* bitmap_value_cur = - const_cast(column_bitmap->get_data().data() + m_row_pos); - BitmapValue* bitmap_value_end = bitmap_value_cur + m_num_rows; + const_cast(column_bitmap->get_data().data() + _row_pos); + BitmapValue* bitmap_value_end = bitmap_value_cur + _num_rows; size_t slice_size; size_t old_size; char* raw_data; - Slice* slice = m_slice.data(); - if (nullmap) { - const UInt8* nullmap_cur = nullmap + m_row_pos; + Slice* slice = _slice.data(); + if (_nullmap) { + const UInt8* nullmap_cur = _nullmap + _row_pos; while (bitmap_value_cur != bitmap_value_end) { if (!*nullmap_cur) { slice_size = bitmap_value_cur->getSizeInBytes(); - old_size = 
m_raw_data.size(); - m_raw_data.resize(old_size + slice_size); + old_size = _raw_data.size(); + _raw_data.resize(old_size + slice_size); - raw_data = m_raw_data.data() + old_size; + raw_data = _raw_data.data() + old_size; bitmap_value_cur->write(raw_data); slice->data = raw_data; @@ -224,14 +221,14 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorObject::convert_to_olap() ++nullmap_cur; ++bitmap_value_cur; } - assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && slice == m_slice.get_end_ptr()); + assert(nullmap_cur == _nullmap + _row_pos + _num_rows && slice == _slice.get_end_ptr()); } else { while (bitmap_value_cur != bitmap_value_end) { slice_size = bitmap_value_cur->getSizeInBytes(); - old_size = m_raw_data.size(); - m_raw_data.resize(old_size + slice_size); + old_size = _raw_data.size(); + _raw_data.resize(old_size + slice_size); - raw_data = m_raw_data.data() + old_size; + raw_data = _raw_data.data() + old_size; bitmap_value_cur->write(raw_data); slice->data = raw_data; @@ -240,7 +237,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorObject::convert_to_olap() ++slice; ++bitmap_value_cur; } - assert(slice == m_slice.get_end_ptr()); + assert(slice == _slice.get_end_ptr()); } return Status::OK(); } @@ -250,56 +247,56 @@ void OlapBlockDataConvertor::OlapColumnDataConvertorHLL::set_source_column( const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, num_rows); - m_raw_data.clear(); - m_slice.resize(num_rows); + _raw_data.clear(); + _slice.resize(num_rows); } const void* OlapBlockDataConvertor::OlapColumnDataConvertorHLL::get_data() const { - return m_slice.data(); + return _slice.data(); } -OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorHLL::get_data_at(size_t offset) const { - assert(offset < m_num_rows && m_num_rows == m_slice.size()); +const void* 
OlapBlockDataConvertor::OlapColumnDataConvertorHLL::get_data_at(size_t offset) const { + assert(offset < _num_rows && _num_rows == _slice.size()); UInt8 null_flag = 0; auto null_map = get_nullmap(); if (null_map) { null_flag = null_map[offset]; } - return {null_flag, m_slice.data() + offset}; + return null_flag ? nullptr : _slice.data() + offset; } Status OlapBlockDataConvertor::OlapColumnDataConvertorHLL::convert_to_olap() { - assert(m_typed_column.column); + assert(_typed_column.column); const vectorized::ColumnHLL* column_hll = nullptr; const UInt8* nullmap = get_nullmap(); if (nullmap) { auto nullable_column = - assert_cast(m_typed_column.column.get()); + assert_cast(_typed_column.column.get()); column_hll = assert_cast( nullable_column->get_nested_column_ptr().get()); } else { - column_hll = assert_cast(m_typed_column.column.get()); + column_hll = assert_cast(_typed_column.column.get()); } assert(column_hll); HyperLogLog* hll_value_cur = - const_cast(column_hll->get_data().data() + m_row_pos); - HyperLogLog* hll_value_end = hll_value_cur + m_num_rows; + const_cast(column_hll->get_data().data() + _row_pos); + HyperLogLog* hll_value_end = hll_value_cur + _num_rows; size_t slice_size; size_t old_size; char* raw_data; - Slice* slice = m_slice.data(); + Slice* slice = _slice.data(); if (nullmap) { - const UInt8* nullmap_cur = nullmap + m_row_pos; + const UInt8* nullmap_cur = nullmap + _row_pos; while (hll_value_cur != hll_value_end) { if (!*nullmap_cur) { slice_size = hll_value_cur->max_serialized_size(); - old_size = m_raw_data.size(); - m_raw_data.resize(old_size + slice_size); + old_size = _raw_data.size(); + _raw_data.resize(old_size + slice_size); - raw_data = m_raw_data.data() + old_size; + raw_data = _raw_data.data() + old_size; slice_size = hll_value_cur->serialize((uint8_t*)raw_data); - m_raw_data.resize(old_size + slice_size); + _raw_data.resize(old_size + slice_size); slice->data = raw_data; slice->size = slice_size; @@ -312,16 +309,16 @@ Status 
OlapBlockDataConvertor::OlapColumnDataConvertorHLL::convert_to_olap() { ++nullmap_cur; ++hll_value_cur; } - assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && slice == m_slice.get_end_ptr()); + assert(nullmap_cur == nullmap + _row_pos + _num_rows && slice == _slice.get_end_ptr()); } else { while (hll_value_cur != hll_value_end) { slice_size = hll_value_cur->max_serialized_size(); - old_size = m_raw_data.size(); - m_raw_data.resize(old_size + slice_size); + old_size = _raw_data.size(); + _raw_data.resize(old_size + slice_size); - raw_data = m_raw_data.data() + old_size; + raw_data = _raw_data.data() + old_size; slice_size = hll_value_cur->serialize((uint8_t*)raw_data); - m_raw_data.resize(old_size + slice_size); + _raw_data.resize(old_size + slice_size); slice->data = raw_data; slice->size = slice_size; @@ -329,14 +326,14 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorHLL::convert_to_olap() { ++slice; ++hll_value_cur; } - assert(slice == m_slice.get_end_ptr()); + assert(slice == _slice.get_end_ptr()); } return Status::OK(); } // class OlapBlockDataConvertor::OlapColumnDataConvertorChar OlapBlockDataConvertor::OlapColumnDataConvertorChar::OlapColumnDataConvertorChar(size_t length) - : m_length(length) { + : _length(length) { assert(length > 0); } @@ -344,51 +341,49 @@ void OlapBlockDataConvertor::OlapColumnDataConvertorChar::set_source_column( const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, num_rows); - m_raw_data.resize(m_length * num_rows); - memset(m_raw_data.data(), 0, m_length * num_rows); - m_slice.resize(num_rows); + _raw_data.resize(_length * num_rows); + memset(_raw_data.data(), 0, _length * num_rows); + _slice.resize(num_rows); } const void* OlapBlockDataConvertor::OlapColumnDataConvertorChar::get_data() const { - return m_slice.data(); + return _slice.data(); } -OlapFieldData 
OlapBlockDataConvertor::OlapColumnDataConvertorChar::get_data_at( +const void* OlapBlockDataConvertor::OlapColumnDataConvertorChar::get_data_at( size_t offset) const { - assert(offset < m_num_rows && m_num_rows == m_slice.size()); + assert(offset < _num_rows && _num_rows == _slice.size()); UInt8 null_flag = 0; - auto null_map = get_nullmap(); - if (null_map) { - null_flag = null_map[offset]; + if (_nullmap) { + null_flag = _nullmap[offset]; } - return {null_flag, m_slice.data() + offset}; + return null_flag ? nullptr : _slice.data() + offset; } Status OlapBlockDataConvertor::OlapColumnDataConvertorChar::convert_to_olap() { - assert(m_typed_column.column); + assert(_typed_column.column); const vectorized::ColumnString* column_string = nullptr; - const UInt8* nullmap = get_nullmap(); - if (nullmap) { + if (_nullmap) { auto nullable_column = - assert_cast(m_typed_column.column.get()); + assert_cast(_typed_column.column.get()); column_string = assert_cast( nullable_column->get_nested_column_ptr().get()); } else { - column_string = assert_cast(m_typed_column.column.get()); + column_string = assert_cast(_typed_column.column.get()); } assert(column_string); const ColumnString::Char* char_data = column_string->get_chars().data(); - const ColumnString::Offset* offset_cur = column_string->get_offsets().data() + m_row_pos; - const ColumnString::Offset* offset_end = offset_cur + m_num_rows; - char* raw_data = m_raw_data.data(); - Slice* slice = m_slice.data(); + const ColumnString::Offset* offset_cur = column_string->get_offsets().data() + _row_pos; + const ColumnString::Offset* offset_end = offset_cur + _num_rows; + char* raw_data = _raw_data.data(); + Slice* slice = _slice.data(); size_t string_length; size_t string_offset = *(offset_cur - 1); - size_t slice_size = m_length; - if (nullmap) { - const UInt8* nullmap_cur = nullmap + m_row_pos; + size_t slice_size = _length; + if (_nullmap) { + const UInt8* nullmap_cur = _nullmap + _row_pos; while (offset_cur != offset_end) { if 
(!*nullmap_cur) { string_length = *offset_cur - string_offset - 1; @@ -409,7 +404,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorChar::convert_to_olap() { ++offset_cur; raw_data += slice_size; } - assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && slice == m_slice.get_end_ptr()); + assert(nullmap_cur == _nullmap + _row_pos + _num_rows && slice == _slice.get_end_ptr()); } else { while (offset_cur != offset_end) { string_length = *offset_cur - string_offset - 1; @@ -424,7 +419,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorChar::convert_to_olap() { ++offset_cur; raw_data += slice_size; } - assert(slice == m_slice.get_end_ptr()); + assert(slice == _slice.get_end_ptr()); } return Status::OK(); } @@ -432,58 +427,56 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorChar::convert_to_olap() { // class OlapBlockDataConvertor::OlapColumnDataConvertorVarChar OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::OlapColumnDataConvertorVarChar( bool check_length) - : m_check_length(check_length) {} + : _check_length(check_length) {} void OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::set_source_column( const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, num_rows); - m_slice.resize(num_rows); + _slice.resize(num_rows); } const void* OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::get_data() const { - return m_slice.data(); + return _slice.data(); } -OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::get_data_at( +const void* OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::get_data_at( size_t offset) const { - assert(offset < m_num_rows && m_num_rows == m_slice.size()); + assert(offset < _num_rows && _num_rows == _slice.size()); UInt8 null_flag = 0; - auto null_map = get_nullmap(); - if (null_map) { - null_flag = null_map[offset]; + if (_nullmap) { + null_flag = _nullmap[offset]; 
} - return {null_flag, m_slice.data() + offset}; + return null_flag ? nullptr : _slice.data() + offset; } Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap() { - assert(m_typed_column.column); + assert(_typed_column.column); const vectorized::ColumnString* column_string = nullptr; - const UInt8* nullmap = get_nullmap(); - if (nullmap) { + if (_nullmap) { auto nullable_column = - assert_cast(m_typed_column.column.get()); + assert_cast(_typed_column.column.get()); column_string = assert_cast( nullable_column->get_nested_column_ptr().get()); } else { - column_string = assert_cast(m_typed_column.column.get()); + column_string = assert_cast(_typed_column.column.get()); } assert(column_string); const char* char_data = (const char*)(column_string->get_chars().data()); - const ColumnString::Offset* offset_cur = column_string->get_offsets().data() + m_row_pos; - const ColumnString::Offset* offset_end = offset_cur + m_num_rows; + const ColumnString::Offset* offset_cur = column_string->get_offsets().data() + _row_pos; + const ColumnString::Offset* offset_end = offset_cur + _num_rows; - Slice* slice = m_slice.data(); + Slice* slice = _slice.data(); size_t string_offset = *(offset_cur - 1); - if (nullmap) { - const UInt8* nullmap_cur = nullmap + m_row_pos; + if (_nullmap) { + const UInt8* nullmap_cur = _nullmap + _row_pos; while (offset_cur != offset_end) { if (!*nullmap_cur) { slice->data = const_cast(char_data + string_offset); slice->size = *offset_cur - string_offset - 1; - if (UNLIKELY(slice->size > MAX_SIZE_OF_VEC_STRING && m_check_length)) { + if (UNLIKELY(slice->size > MAX_SIZE_OF_VEC_STRING && _check_length)) { return Status::NotSupported( "Not support string len over than 1MB in vec engine."); } @@ -497,19 +490,19 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap() ++slice; ++offset_cur; } - assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && slice == m_slice.get_end_ptr()); + assert(nullmap_cur == 
_nullmap + _row_pos + _num_rows && slice == _slice.get_end_ptr()); } else { while (offset_cur != offset_end) { slice->data = const_cast(char_data + string_offset); slice->size = *offset_cur - string_offset - 1; - if (UNLIKELY(slice->size > MAX_SIZE_OF_VEC_STRING && m_check_length)) { + if (UNLIKELY(slice->size > MAX_SIZE_OF_VEC_STRING && _check_length)) { return Status::NotSupported("Not support string len over than 1MB in vec engine."); } string_offset = *offset_cur; ++slice; ++offset_cur; } - assert(slice == m_slice.get_end_ptr()); + assert(slice == _slice.get_end_ptr()); } return Status::OK(); } @@ -519,46 +512,44 @@ void OlapBlockDataConvertor::OlapColumnDataConvertorDate::set_source_column( const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, num_rows); - m_values.resize(num_rows); + _values.resize(num_rows); } const void* OlapBlockDataConvertor::OlapColumnDataConvertorDate::get_data() const { - return m_values.data(); + return _values.data(); } -OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorDate::get_data_at( +const void* OlapBlockDataConvertor::OlapColumnDataConvertorDate::get_data_at( size_t offset) const { - assert(offset < m_num_rows && m_num_rows == m_values.size()); + assert(offset < _num_rows && _num_rows == _values.size()); UInt8 null_flag = 0; - auto null_map = get_nullmap(); - if (null_map) { - null_flag = null_map[offset]; + if (_nullmap) { + null_flag = _nullmap[offset]; } - return {null_flag, m_values.data() + offset}; + return null_flag ? 
nullptr : _values.data() + offset; } Status OlapBlockDataConvertor::OlapColumnDataConvertorDate::convert_to_olap() { - assert(m_typed_column.column); + assert(_typed_column.column); const vectorized::ColumnVector* column_datetime = nullptr; - const UInt8* nullmap = get_nullmap(); - if (nullmap) { + if (_nullmap) { auto nullable_column = - assert_cast(m_typed_column.column.get()); + assert_cast(_typed_column.column.get()); column_datetime = assert_cast*>( nullable_column->get_nested_column_ptr().get()); } else { column_datetime = assert_cast*>( - m_typed_column.column.get()); + _typed_column.column.get()); } assert(column_datetime); const VecDateTimeValue* datetime_cur = - (const VecDateTimeValue*)(column_datetime->get_data().data()) + m_row_pos; - const VecDateTimeValue* datetime_end = datetime_cur + m_num_rows; - uint24_t* value = m_values.data(); - if (nullmap) { - const UInt8* nullmap_cur = nullmap + m_row_pos; + (const VecDateTimeValue*)(column_datetime->get_data().data()) + _row_pos; + const VecDateTimeValue* datetime_end = datetime_cur + _num_rows; + uint24_t* value = _values.data(); + if (_nullmap) { + const UInt8* nullmap_cur = _nullmap + _row_pos; while (datetime_cur != datetime_end) { if (!*nullmap_cur) { *value = datetime_cur->to_olap_date(); @@ -569,14 +560,14 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorDate::convert_to_olap() { ++datetime_cur; ++nullmap_cur; } - assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && value == m_values.get_end_ptr()); + assert(nullmap_cur == _nullmap + _row_pos + _num_rows && value == _values.get_end_ptr()); } else { while (datetime_cur != datetime_end) { *value = datetime_cur->to_olap_date(); ++value; ++datetime_cur; } - assert(value == m_values.get_end_ptr()); + assert(value == _values.get_end_ptr()); } return Status::OK(); } @@ -586,46 +577,44 @@ void OlapBlockDataConvertor::OlapColumnDataConvertorDateTime::set_source_column( const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { 
OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, num_rows); - m_values.resize(num_rows); + _values.resize(num_rows); } const void* OlapBlockDataConvertor::OlapColumnDataConvertorDateTime::get_data() const { - return m_values.data(); + return _values.data(); } -OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorDateTime::get_data_at( +const void* OlapBlockDataConvertor::OlapColumnDataConvertorDateTime::get_data_at( size_t offset) const { - assert(offset < m_num_rows && m_num_rows == m_values.size()); + assert(offset < _num_rows && _num_rows == _values.size()); UInt8 null_flag = 0; - auto null_map = get_nullmap(); - if (null_map) { - null_flag = null_map[offset]; + if (_nullmap) { + null_flag = _nullmap[offset]; } - return {null_flag, m_values.data() + offset}; + return null_flag ? nullptr : _values.data() + offset; } Status OlapBlockDataConvertor::OlapColumnDataConvertorDateTime::convert_to_olap() { - assert(m_typed_column.column); + assert(_typed_column.column); const vectorized::ColumnVector* column_datetime = nullptr; - const UInt8* nullmap = get_nullmap(); - if (nullmap) { + if (_nullmap) { auto nullable_column = - assert_cast(m_typed_column.column.get()); + assert_cast(_typed_column.column.get()); column_datetime = assert_cast*>( nullable_column->get_nested_column_ptr().get()); } else { column_datetime = assert_cast*>( - m_typed_column.column.get()); + _typed_column.column.get()); } assert(column_datetime); const VecDateTimeValue* datetime_cur = - (const VecDateTimeValue*)(column_datetime->get_data().data()) + m_row_pos; - const VecDateTimeValue* datetime_end = datetime_cur + m_num_rows; - uint64_t* value = m_values.data(); - if (nullmap) { - const UInt8* nullmap_cur = nullmap + m_row_pos; + (const VecDateTimeValue*)(column_datetime->get_data().data()) + _row_pos; + const VecDateTimeValue* datetime_end = datetime_cur + _num_rows; + uint64_t* value = _values.data(); + if (_nullmap) { + const UInt8* 
nullmap_cur = _nullmap + _row_pos; while (datetime_cur != datetime_end) { if (!*nullmap_cur) { *value = datetime_cur->to_olap_datetime(); @@ -636,14 +625,14 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorDateTime::convert_to_olap( ++datetime_cur; ++nullmap_cur; } - assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && value == m_values.get_end_ptr()); + assert(nullmap_cur == _nullmap + _row_pos + _num_rows && value == _values.get_end_ptr()); } else { while (datetime_cur != datetime_end) { *value = datetime_cur->to_olap_datetime(); ++value; ++datetime_cur; } - assert(value == m_values.get_end_ptr()); + assert(value == _values.get_end_ptr()); } return Status::OK(); } @@ -653,46 +642,44 @@ void OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::set_source_column( const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) { OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column(typed_column, row_pos, num_rows); - m_values.resize(num_rows); + _values.resize(num_rows); } const void* OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::get_data() const { - return m_values.data(); + return _values.data(); } -OlapFieldData OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::get_data_at( +const void* OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::get_data_at( size_t offset) const { - assert(offset < m_num_rows && m_num_rows == m_values.size()); + assert(offset < _num_rows && _num_rows == _values.size()); UInt8 null_flag = 0; - auto null_map = get_nullmap(); - if (null_map) { - null_flag = null_map[offset]; + if (_nullmap) { + null_flag = _nullmap[offset]; } - return {null_flag, m_values.data() + offset}; + return null_flag ? 
nullptr : _values.data() + offset; } Status OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::convert_to_olap() { - assert(m_typed_column.column); + assert(_typed_column.column); const vectorized::ColumnDecimal* column_decimal = nullptr; - const UInt8* nullmap = get_nullmap(); - if (nullmap) { + if (_nullmap) { auto nullable_column = - assert_cast(m_typed_column.column.get()); + assert_cast(_typed_column.column.get()); column_decimal = assert_cast*>( nullable_column->get_nested_column_ptr().get()); } else { column_decimal = assert_cast*>( - m_typed_column.column.get()); + _typed_column.column.get()); } assert(column_decimal); const DecimalV2Value* decimal_cur = - (const DecimalV2Value*)(column_decimal->get_data().data()) + m_row_pos; - const DecimalV2Value* decimal_end = decimal_cur + m_num_rows; - decimal12_t* value = m_values.data(); - if (nullmap) { - const UInt8* nullmap_cur = nullmap + m_row_pos; + (const DecimalV2Value*)(column_decimal->get_data().data()) + _row_pos; + const DecimalV2Value* decimal_end = decimal_cur + _num_rows; + decimal12_t* value = _values.data(); + if (_nullmap) { + const UInt8* nullmap_cur = _nullmap + _row_pos; while (decimal_cur != decimal_end) { if (!*nullmap_cur) { value->integer = decimal_cur->int_value(); @@ -704,7 +691,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::convert_to_olap() ++decimal_cur; ++nullmap_cur; } - assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && value == m_values.get_end_ptr()); + assert(nullmap_cur == _nullmap + _row_pos + _num_rows && value == _values.get_end_ptr()); } else { while (decimal_cur != decimal_end) { value->integer = decimal_cur->int_value(); @@ -712,7 +699,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorDecimal::convert_to_olap() ++value; ++decimal_cur; } - assert(value == m_values.get_end_ptr()); + assert(value == _values.get_end_ptr()); } return Status::OK(); } diff --git a/be/src/vec/olap/olap_data_convertor.h 
b/be/src/vec/olap/olap_data_convertor.h index 002dfc419ebdfa..dc0ac23c006e7d 100644 --- a/be/src/vec/olap/olap_data_convertor.h +++ b/be/src/vec/olap/olap_data_convertor.h @@ -21,16 +21,11 @@ namespace doris::vectorized { -struct OlapFieldData { - UInt8 null_flag; - const void* value; -}; - class IOlapColumnDataAccessor { public: virtual const UInt8* get_nullmap() const = 0; virtual const void* get_data() const = 0; - virtual OlapFieldData get_data_at(size_t offset) const = 0; + virtual const void* get_data_at(size_t offset) const = 0; virtual ~IOlapColumnDataAccessor() {} }; using IOlapColumnDataAccessorSPtr = std::shared_ptr; @@ -60,9 +55,10 @@ class OlapBlockDataConvertor { virtual Status convert_to_olap() = 0; protected: - ColumnWithTypeAndName m_typed_column; - size_t m_row_pos; - size_t m_num_rows; + ColumnWithTypeAndName _typed_column; + size_t _row_pos = 0; + size_t _num_rows = 0; + const UInt8* _nullmap = nullptr; }; using OlapColumnDataConvertorBaseSPtr = std::shared_ptr; @@ -74,12 +70,12 @@ class OlapBlockDataConvertor { void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) override; const void* get_data() const override; - OlapFieldData get_data_at(size_t offset) const override; + const void* get_data_at(size_t offset) const override; Status convert_to_olap() override; private: - PaddedPODArray m_slice; - PaddedPODArray m_raw_data; + PaddedPODArray _slice; + PaddedPODArray _raw_data; }; class OlapColumnDataConvertorHLL : public OlapColumnDataConvertorBase { @@ -90,12 +86,12 @@ class OlapBlockDataConvertor { void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) override; const void* get_data() const override; - OlapFieldData get_data_at(size_t offset) const override; + const void* get_data_at(size_t offset) const override; Status convert_to_olap() override; private: - PaddedPODArray m_slice; - PaddedPODArray m_raw_data; + PaddedPODArray _slice; + PaddedPODArray 
_raw_data; }; class OlapColumnDataConvertorChar : public OlapColumnDataConvertorBase { @@ -106,13 +102,13 @@ class OlapBlockDataConvertor { void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) override; const void* get_data() const override; - OlapFieldData get_data_at(size_t offset) const override; + const void* get_data_at(size_t offset) const override; Status convert_to_olap() override; private: - size_t m_length; - PaddedPODArray m_slice; - PaddedPODArray m_raw_data; + size_t _length; + PaddedPODArray _slice; + PaddedPODArray _raw_data; }; class OlapColumnDataConvertorVarChar : public OlapColumnDataConvertorBase { @@ -123,12 +119,12 @@ class OlapBlockDataConvertor { void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) override; const void* get_data() const override; - OlapFieldData get_data_at(size_t offset) const override; + const void* get_data_at(size_t offset) const override; Status convert_to_olap() override; private: - bool m_check_length; - PaddedPODArray m_slice; + bool _check_length; + PaddedPODArray _slice; }; class OlapColumnDataConvertorDate : public OlapColumnDataConvertorBase { @@ -139,11 +135,11 @@ class OlapBlockDataConvertor { void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) override; const void* get_data() const override; - OlapFieldData get_data_at(size_t offset) const override; + const void* get_data_at(size_t offset) const override; Status convert_to_olap() override; private: - PaddedPODArray m_values; + PaddedPODArray _values; }; class OlapColumnDataConvertorDateTime : public OlapColumnDataConvertorBase { @@ -154,11 +150,11 @@ class OlapBlockDataConvertor { void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) override; const void* get_data() const override; - OlapFieldData get_data_at(size_t offset) const override; + const void* get_data_at(size_t offset) 
const override; Status convert_to_olap() override; private: - PaddedPODArray m_values; + PaddedPODArray _values; }; class OlapColumnDataConvertorDecimal : public OlapColumnDataConvertorBase { @@ -169,11 +165,11 @@ class OlapBlockDataConvertor { void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, size_t num_rows) override; const void* get_data() const override; - OlapFieldData get_data_at(size_t offset) const override; + const void* get_data_at(size_t offset) const override; Status convert_to_olap() override; private: - PaddedPODArray m_values; + PaddedPODArray _values; }; // class OlapColumnDataConvertorSimple for simple types, which don't need to do any convert, like int, float, double, etc... @@ -183,74 +179,40 @@ class OlapBlockDataConvertor { OlapColumnDataConvertorSimple() = default; ~OlapColumnDataConvertorSimple() override = default; - void set_source_column(const ColumnWithTypeAndName& typed_column, size_t row_pos, - size_t num_rows) override { - OlapBlockDataConvertor::OlapColumnDataConvertorBase::set_source_column( - typed_column, row_pos, num_rows); - m_values.resize(num_rows); - } - - const void* get_data() const override { return m_values.data(); } + const void* get_data() const override { return _values; } - OlapFieldData get_data_at(size_t offset) const override { - assert(offset < m_num_rows && m_num_rows == m_values.size()); + const void* get_data_at(size_t offset) const override { + assert(offset < _num_rows); UInt8 null_flag = 0; - auto null_map = get_nullmap(); - if (null_map) { - null_flag = null_map[offset]; + if (_nullmap) { + null_flag = _nullmap[offset]; } - return {null_flag, m_values.data() + offset}; + return null_flag ? 
nullptr : _values + offset; } Status convert_to_olap() override { const vectorized::ColumnVector* column_data = nullptr; - const UInt8* nullmap = get_nullmap(); - if (nullmap) { + if (_nullmap) { auto nullable_column = - assert_cast(m_typed_column.column.get()); + assert_cast(_typed_column.column.get()); column_data = assert_cast*>( nullable_column->get_nested_column_ptr().get()); } else { column_data = assert_cast*>( - m_typed_column.column.get()); + _typed_column.column.get()); } assert(column_data); - - const T* data_cur = (const T*)(column_data->get_data().data()) + m_row_pos; - const T* data_end = data_cur + m_num_rows; - T* value = m_values.data(); - if (nullmap) { - const UInt8* nullmap_cur = nullmap + m_row_pos; - while (data_cur != data_end) { - if (!*nullmap_cur) { - *value = *data_cur; - } else { - // do nothing - } - ++value; - ++data_cur; - ++nullmap_cur; - } - assert(nullmap_cur == nullmap + m_row_pos + m_num_rows && - value == m_values.get_end_ptr()); - } else { - while (data_cur != data_end) { - *value = *data_cur; - ++value; - ++data_cur; - } - assert(value == m_values.get_end_ptr()); - } + _values = (const T*)(column_data->get_data().data()) + _row_pos; return Status::OK(); } private: - PaddedPODArray m_values; + const T* _values = nullptr; }; private: - std::vector m_convertors; + std::vector _convertors; }; } // namespace doris::vectorized From 3a910e8734a67a31456395486d1c263cb3617a1d Mon Sep 17 00:00:00 2001 From: "minghong.zhou" Date: Fri, 18 Mar 2022 12:55:39 +0800 Subject: [PATCH 25/32] memtable _vflush() skip empty block --- be/src/olap/memtable.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 95bfc75b7cd58b..0d4dfc185eada0 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -308,6 +308,11 @@ void dump(const vectorized::Block& block, int64_t tablet_id) { } OLAPStatus MemTable::_vflush(){ + //skip empty tablet + if (_rows == 0) + { + return OLAP_SUCCESS; 
+ } VLOG_CRITICAL << "begin to flush memtable for tablet: " << _tablet_id << ", memsize: " << memory_usage() << ", rows: " << _rows; size_t _flush_size = 0; From f9ab0139a830b01035f2a3312b5f7d908a6f15d9 Mon Sep 17 00:00:00 2001 From: jacktengg Date: Fri, 18 Mar 2022 14:13:13 +0800 Subject: [PATCH 26/32] fix compile errors --- be/src/olap/memtable.cpp | 2 +- be/src/olap/rowset/beta_rowset_writer.cpp | 3 +++ be/src/vec/exec/vbroker_scan_node.cpp | 2 +- be/src/vec/runtime/vtablets_channel.cpp | 1 + be/src/vec/sink/vtablet_sink.cpp | 2 +- 5 files changed, 7 insertions(+), 3 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 0d4dfc185eada0..6dc19da445aeff 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -136,7 +136,7 @@ void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num size_t oldsize = block->allocated_bytes(); _input_mutable_block.add_rows(block, row_pos, num_rows); size_t newsize = block->allocated_bytes(); - _mem_tracker->consume(newsize - oldsize); + _mem_tracker->Consume(newsize - oldsize); for(int i = 0; i < num_rows; i++){ RowInBlock* row_in_block_ptr = new RowInBlock(cursor_in_mutableblock + i); diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index c5ec157136d0c5..e60d6abb02314d 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -89,6 +89,9 @@ OLAPStatus BetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_conte } OLAPStatus BetaRowsetWriter::add_block(const vectorized::Block* block) { + if (block->rows() == 0) { + return OLAP_SUCCESS; + } if (UNLIKELY(_segment_writer == nullptr)) { RETURN_NOT_OK(_create_segment_writer(&_segment_writer)); } diff --git a/be/src/vec/exec/vbroker_scan_node.cpp b/be/src/vec/exec/vbroker_scan_node.cpp index 9cb919228d0043..bc20bf05e8c78a 100644 --- a/be/src/vec/exec/vbroker_scan_node.cpp +++ b/be/src/vec/exec/vbroker_scan_node.cpp @@ 
-168,7 +168,7 @@ Status VBrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, // 1. too many batches in queue, or // 2. at least one batch in queue and memory exceed limit. (_block_queue.size() >= _max_buffered_batches || - (mem_tracker()->any_limit_exceeded() && !_block_queue.empty()))) { + (mem_tracker()->AnyLimitExceeded(MemLimit::HARD) && !_block_queue.empty()))) { _queue_writer_cond.wait_for(l, std::chrono::seconds(1)); } // Process already set failed, so we just return OK diff --git a/be/src/vec/runtime/vtablets_channel.cpp b/be/src/vec/runtime/vtablets_channel.cpp index 228f8f6f55f96b..1840b0d2be18d7 100644 --- a/be/src/vec/runtime/vtablets_channel.cpp +++ b/be/src/vec/runtime/vtablets_channel.cpp @@ -56,6 +56,7 @@ Status VTabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& reques wrequest.txn_id = _txn_id; wrequest.partition_id = tablet.partition_id(); wrequest.load_id = request.id(); + wrequest.need_gen_rollup = request.need_gen_rollup(); wrequest.slots = index_slots; wrequest.is_high_priority = _is_high_priority; diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp index b27d76a2523f5f..a3ace9da8abdf8 100644 --- a/be/src/vec/sink/vtablet_sink.cpp +++ b/be/src/vec/sink/vtablet_sink.cpp @@ -137,7 +137,7 @@ Status VNodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { // But there is still some unfinished things, we do mem limit here temporarily. // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). 
- while (!_cancelled && _parent->_mem_tracker->any_limit_exceeded() && + while (!_cancelled && _parent->_mem_tracker->AnyLimitExceeded(MemLimit::HARD) && _pending_batches_num > 0) { SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); SleepFor(MonoDelta::FromMilliseconds(10)); From f007e993b6d784bbb719092d4972314f019c41e0 Mon Sep 17 00:00:00 2001 From: Zhou Minghong Date: Fri, 18 Mar 2022 14:55:49 +0800 Subject: [PATCH 27/32] fix compile error --- be/src/runtime/tablets_channel.cpp | 2 +- be/src/vec/runtime/vtablets_channel.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp index e281892526bdf8..b02287b91598ea 100644 --- a/be/src/runtime/tablets_channel.cpp +++ b/be/src/runtime/tablets_channel.cpp @@ -76,7 +76,7 @@ Status TabletsChannel::open(const PTabletWriterOpenRequest& request) { Status TabletsChannel::add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response) { DCHECK(request.tablet_ids_size() == request.row_batch().num_rows()); - int64_t cur_seq; + int64_t cur_seq = 0; auto status = _get_current_seq(cur_seq, request); if (UNLIKELY(!status.ok())) { diff --git a/be/src/vec/runtime/vtablets_channel.cpp b/be/src/vec/runtime/vtablets_channel.cpp index 1840b0d2be18d7..6d37677ef92540 100644 --- a/be/src/vec/runtime/vtablets_channel.cpp +++ b/be/src/vec/runtime/vtablets_channel.cpp @@ -79,7 +79,7 @@ Status VTabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& reques Status VTabletsChannel::add_block(const PTabletWriterAddBlockRequest& request, PTabletWriterAddBlockResult* response) { - int64_t cur_seq; + int64_t cur_seq = 0; auto status = _get_current_seq(cur_seq, request); if (UNLIKELY(!status.ok())) { From 6e141ae81ef8d11ccb4893e1cd7c8cfb974f34c7 Mon Sep 17 00:00:00 2001 From: Zhou Minghong Date: Fri, 18 Mar 2022 16:05:31 +0800 Subject: [PATCH 28/32] replace agg by last item --- be/src/olap/memtable.cpp | 8 +++++- 
.../aggregate_function_reader.cpp | 4 +++ .../aggregate_function_window.cpp | 27 +++++++++++++++++++ .../aggregate_function_window.h | 15 +++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 6dc19da445aeff..c89ab9499c336b 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -75,7 +75,13 @@ void MemTable::_init_agg_functions(const vectorized::Block* block) ->column(cid) .aggregation(); std::string agg_name = - TabletColumn::get_string_by_aggregation_type(agg_method) + "_reader"; + TabletColumn::get_string_by_aggregation_type(agg_method); + if (agg_name=="replace"){ + agg_name += "_last"; + } + agg_name += "_reader"; + + std::transform(agg_name.begin(), agg_name.end(), agg_name.begin(), [](unsigned char c) { return std::tolower(c); }); diff --git a/be/src/vec/aggregate_functions/aggregate_function_reader.cpp b/be/src/vec/aggregate_functions/aggregate_function_reader.cpp index ce78397794fb4e..1e0de0fcfaeec7 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_reader.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_reader.cpp @@ -31,6 +31,7 @@ void register_aggregate_function_reader(AggregateFunctionSimpleFactory& factory) register_function_reader("max", create_aggregate_function_max); register_function_reader("min", create_aggregate_function_min); register_function_reader("replace_if_not_null", create_aggregate_function_replace_if_not_null); + register_function_reader("replace_last_if_not_null", create_aggregate_function_replace_last_if_not_null); register_function_reader("bitmap_union", create_aggregate_function_bitmap_union); register_function_reader("hll_union", create_aggregate_function_HLL_union); } @@ -43,6 +44,9 @@ void register_aggregate_function_reader_no_spread(AggregateFunctionSimpleFactory register_function_reader("replace", create_aggregate_function_replace, false); register_function_reader("replace", 
create_aggregate_function_replace_nullable, true); + + register_function_reader("replace_last", create_aggregate_function_replace_last, false); + register_function_reader("replace_last", create_aggregate_function_replace_last_nullable, true); } } // namespace doris::vectorized diff --git a/be/src/vec/aggregate_functions/aggregate_function_window.cpp b/be/src/vec/aggregate_functions/aggregate_function_window.cpp index c40f2bce53912d..27090fe3185229 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_window.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_window.cpp @@ -139,6 +139,15 @@ AggregateFunctionPtr create_aggregate_function_replace_if_not_null(const std::st name, argument_types, parameters)); } +AggregateFunctionPtr create_aggregate_function_replace_last_if_not_null(const std::string& name, + const DataTypes& argument_types, + const Array& parameters, + const bool result_is_nullable) { + return AggregateFunctionPtr( + create_function_single_value( + name, argument_types, parameters)); +} + AggregateFunctionPtr create_aggregate_function_replace(const std::string& name, const DataTypes& argument_types, const Array& parameters, @@ -148,6 +157,15 @@ AggregateFunctionPtr create_aggregate_function_replace(const std::string& name, name, argument_types, parameters)); } +AggregateFunctionPtr create_aggregate_function_replace_last(const std::string& name, + const DataTypes& argument_types, + const Array& parameters, + const bool result_is_nullable) { + return AggregateFunctionPtr( + create_function_single_value( + name, argument_types, parameters)); +} + AggregateFunctionPtr create_aggregate_function_replace_nullable(const std::string& name, const DataTypes& argument_types, const Array& parameters, @@ -157,6 +175,15 @@ AggregateFunctionPtr create_aggregate_function_replace_nullable(const std::strin name, argument_types, parameters)); } +AggregateFunctionPtr create_aggregate_function_replace_last_nullable(const std::string& name, + const 
DataTypes& argument_types, + const Array& parameters, + const bool result_is_nullable) { + return AggregateFunctionPtr( + create_function_single_value( + name, argument_types, parameters)); +} + void register_aggregate_function_window_rank(AggregateFunctionSimpleFactory& factory) { factory.register_function("dense_rank", create_aggregate_function_dense_rank); factory.register_function("rank", create_aggregate_function_rank); diff --git a/be/src/vec/aggregate_functions/aggregate_function_window.h b/be/src/vec/aggregate_functions/aggregate_function_window.h index d7e75981d03f7a..545339e601e784 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_window.h +++ b/be/src/vec/aggregate_functions/aggregate_function_window.h @@ -420,4 +420,19 @@ AggregateFunctionPtr create_aggregate_function_replace_nullable(const std::strin const Array& parameters, const bool result_is_nullable); +AggregateFunctionPtr create_aggregate_function_replace_last_if_not_null(const std::string& name, + const DataTypes& argument_types, + const Array& parameters, + const bool result_is_nullable); + +AggregateFunctionPtr create_aggregate_function_replace_last(const std::string& name, + const DataTypes& argument_types, + const Array& parameters, + const bool result_is_nullable); + +AggregateFunctionPtr create_aggregate_function_replace_last_nullable(const std::string& name, + const DataTypes& argument_types, + const Array& parameters, + const bool result_is_nullable); + } // namespace doris::vectorized From 24fa448f84263aa2b4566a632d1a94f6dffcf690 Mon Sep 17 00:00:00 2001 From: Zhou Minghong Date: Fri, 18 Mar 2022 21:40:54 +0800 Subject: [PATCH 29/32] memtable mem usage --- be/src/olap/memtable.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index c89ab9499c336b..886a58c10b712a 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -139,9 +139,9 @@ void MemTable::insert(const vectorized::Block* 
block, size_t row_pos, size_t num } } size_t cursor_in_mutableblock = _input_mutable_block.rows(); - size_t oldsize = block->allocated_bytes(); + size_t oldsize = _input_mutable_block.allocated_bytes(); _input_mutable_block.add_rows(block, row_pos, num_rows); - size_t newsize = block->allocated_bytes(); + size_t newsize = _input_mutable_block.allocated_bytes(); _mem_tracker->Consume(newsize - oldsize); for(int i = 0; i < num_rows; i++){ From 06a2292cc707a1d76d1ae888fc17bbfaaeda325d Mon Sep 17 00:00:00 2001 From: Zhou Minghong Date: Mon, 21 Mar 2022 09:34:28 +0800 Subject: [PATCH 30/32] using last_value function for replace aggregation in stream load --- be/src/olap/memtable.cpp | 13 +++++---- be/src/olap/memtable.h | 1 + .../aggregate_function_reader.cpp | 4 --- .../aggregate_function_window.cpp | 27 ------------------- .../aggregate_function_window.h | 15 ----------- 5 files changed, 9 insertions(+), 51 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 886a58c10b712a..ef7c807877d94a 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -45,7 +45,8 @@ MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet _schema_size(_schema->schema_size()), _rowset_writer(rowset_writer), _is_first_insertion(true), - _agg_functions(schema->num_columns()){ + _agg_functions(schema->num_columns()), + _mem_usage(0){ if (support_vec){ _skip_list = nullptr; _vec_row_comparator = std::make_shared(_schema); @@ -76,12 +77,12 @@ void MemTable::_init_agg_functions(const vectorized::Block* block) .aggregation(); std::string agg_name = TabletColumn::get_string_by_aggregation_type(agg_method); - if (agg_name=="replace"){ - agg_name += "_last"; + if (agg_name=="REPLACE"){ + agg_name = "last_value"; + }else{ + agg_name += "_reader"; } - agg_name += "_reader"; - std::transform(agg_name.begin(), agg_name.end(), agg_name.begin(), [](unsigned char c) { return std::tolower(c); }); @@ -109,6 +110,7 @@ MemTable::~MemTable() { 
delete row; } } + _mem_tracker->Release(_mem_usage); } MemTable::RowCursorComparator::RowCursorComparator(const Schema* schema) : _schema(schema) {} @@ -142,6 +144,7 @@ void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num size_t oldsize = _input_mutable_block.allocated_bytes(); _input_mutable_block.add_rows(block, row_pos, num_rows); size_t newsize = _input_mutable_block.allocated_bytes(); + _mem_usage += newsize - oldsize; _mem_tracker->Consume(newsize - oldsize); for(int i = 0; i < num_rows; i++){ diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index e48bf58314cde0..22d683dcfe6faa 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -201,6 +201,7 @@ class MemTable { void _init_agg_functions(const vectorized::Block* block); std::vector _agg_functions; std::vector rowInBlocks; + size_t _mem_usage; }; // class MemTable diff --git a/be/src/vec/aggregate_functions/aggregate_function_reader.cpp b/be/src/vec/aggregate_functions/aggregate_function_reader.cpp index 1e0de0fcfaeec7..ce78397794fb4e 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_reader.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_reader.cpp @@ -31,7 +31,6 @@ void register_aggregate_function_reader(AggregateFunctionSimpleFactory& factory) register_function_reader("max", create_aggregate_function_max); register_function_reader("min", create_aggregate_function_min); register_function_reader("replace_if_not_null", create_aggregate_function_replace_if_not_null); - register_function_reader("replace_last_if_not_null", create_aggregate_function_replace_last_if_not_null); register_function_reader("bitmap_union", create_aggregate_function_bitmap_union); register_function_reader("hll_union", create_aggregate_function_HLL_union); } @@ -44,9 +43,6 @@ void register_aggregate_function_reader_no_spread(AggregateFunctionSimpleFactory register_function_reader("replace", create_aggregate_function_replace, false); 
register_function_reader("replace", create_aggregate_function_replace_nullable, true); - - register_function_reader("replace_last", create_aggregate_function_replace_last, false); - register_function_reader("replace_last", create_aggregate_function_replace_last_nullable, true); } } // namespace doris::vectorized diff --git a/be/src/vec/aggregate_functions/aggregate_function_window.cpp b/be/src/vec/aggregate_functions/aggregate_function_window.cpp index 27090fe3185229..c40f2bce53912d 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_window.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_window.cpp @@ -139,15 +139,6 @@ AggregateFunctionPtr create_aggregate_function_replace_if_not_null(const std::st name, argument_types, parameters)); } -AggregateFunctionPtr create_aggregate_function_replace_last_if_not_null(const std::string& name, - const DataTypes& argument_types, - const Array& parameters, - const bool result_is_nullable) { - return AggregateFunctionPtr( - create_function_single_value( - name, argument_types, parameters)); -} - AggregateFunctionPtr create_aggregate_function_replace(const std::string& name, const DataTypes& argument_types, const Array& parameters, @@ -157,15 +148,6 @@ AggregateFunctionPtr create_aggregate_function_replace(const std::string& name, name, argument_types, parameters)); } -AggregateFunctionPtr create_aggregate_function_replace_last(const std::string& name, - const DataTypes& argument_types, - const Array& parameters, - const bool result_is_nullable) { - return AggregateFunctionPtr( - create_function_single_value( - name, argument_types, parameters)); -} - AggregateFunctionPtr create_aggregate_function_replace_nullable(const std::string& name, const DataTypes& argument_types, const Array& parameters, @@ -175,15 +157,6 @@ AggregateFunctionPtr create_aggregate_function_replace_nullable(const std::strin name, argument_types, parameters)); } -AggregateFunctionPtr create_aggregate_function_replace_last_nullable(const 
std::string& name, - const DataTypes& argument_types, - const Array& parameters, - const bool result_is_nullable) { - return AggregateFunctionPtr( - create_function_single_value( - name, argument_types, parameters)); -} - void register_aggregate_function_window_rank(AggregateFunctionSimpleFactory& factory) { factory.register_function("dense_rank", create_aggregate_function_dense_rank); factory.register_function("rank", create_aggregate_function_rank); diff --git a/be/src/vec/aggregate_functions/aggregate_function_window.h b/be/src/vec/aggregate_functions/aggregate_function_window.h index 545339e601e784..d7e75981d03f7a 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_window.h +++ b/be/src/vec/aggregate_functions/aggregate_function_window.h @@ -420,19 +420,4 @@ AggregateFunctionPtr create_aggregate_function_replace_nullable(const std::strin const Array& parameters, const bool result_is_nullable); -AggregateFunctionPtr create_aggregate_function_replace_last_if_not_null(const std::string& name, - const DataTypes& argument_types, - const Array& parameters, - const bool result_is_nullable); - -AggregateFunctionPtr create_aggregate_function_replace_last(const std::string& name, - const DataTypes& argument_types, - const Array& parameters, - const bool result_is_nullable); - -AggregateFunctionPtr create_aggregate_function_replace_last_nullable(const std::string& name, - const DataTypes& argument_types, - const Array& parameters, - const bool result_is_nullable); - } // namespace doris::vectorized From 1c3c547df019a5d9c144675b7edac9a65f77a4b4 Mon Sep 17 00:00:00 2001 From: jacktengg Date: Mon, 21 Mar 2022 16:34:22 +0800 Subject: [PATCH 31/32] add config items for vectorized stream load --- be/src/common/config.h | 1 + be/src/runtime/exec_env_init.cpp | 6 +++++- .../src/main/java/org/apache/doris/common/Config.java | 3 +++ .../java/org/apache/doris/planner/StreamLoadPlanner.java | 5 ++++- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git 
a/be/src/common/config.h b/be/src/common/config.h index fe92b6b3ea44e0..f6987e746904b7 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -731,6 +731,7 @@ CONF_mInt32(string_type_length_soft_limit_bytes, "1048576"); CONF_Validator(string_type_length_soft_limit_bytes, [](const int config) -> bool { return config > 0 && config <= 2147483643; }); +CONF_mBool(enable_vectorized_load, "false"); } // namespace config diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 4d347c16bb270d..a0964afbcf14dd 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -143,7 +143,11 @@ Status ExecEnv::_init(const std::vector& store_paths) { _tmp_file_mgr = new TmpFileMgr(this); _bfd_parser = BfdParser::create(); _broker_mgr = new BrokerMgr(this); - _load_channel_mgr = new vectorized::VLoadChannelMgr(); + if (config::enable_vectorized_load) { + _load_channel_mgr = new vectorized::VLoadChannelMgr(); + } else { + _load_channel_mgr = new LoadChannelMgr(); + } _load_stream_mgr = new LoadStreamMgr(); _internal_client_cache = new BrpcClientCache(); _function_client_cache = new BrpcClientCache(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/Config.java b/fe/fe-core/src/main/java/org/apache/doris/common/Config.java index 15516769109b59..7e8acfa2bc2b18 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/Config.java @@ -1650,4 +1650,7 @@ public class Config extends ConfigBase { */ @ConfField(mutable = true) public static boolean skip_compaction_slower_replica = true; + + @ConfField + public static boolean enable_vectorized_load = false; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java index 761cbf9fcec05d..4862de3d06669f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java @@ -33,6 +33,7 @@ import org.apache.doris.catalog.PartitionInfo; import org.apache.doris.catalog.PartitionItem; import org.apache.doris.catalog.PartitionType; +import org.apache.doris.common.Config; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.DdlException; import org.apache.doris.common.ErrorCode; @@ -192,7 +193,9 @@ public TExecPlanFragmentParams plan(TUniqueId loadId) throws UserException { queryOptions.setMemLimit(taskInfo.getMemLimit()); // for stream load, we use exec_mem_limit to limit the memory usage of load channel. queryOptions.setLoadMemLimit(taskInfo.getMemLimit()); - queryOptions.setEnableVectorizedEngine(true); + if (Config.enable_vectorized_load) { + queryOptions.setEnableVectorizedEngine(true); + } params.setQueryOptions(queryOptions); TQueryGlobals queryGlobals = new TQueryGlobals(); queryGlobals.setNowString(DATE_FORMAT.format(new Date())); From e291a4603c5b3aceb2a8fa5bfdb419765507f367 Mon Sep 17 00:00:00 2001 From: jacktengg Date: Mon, 28 Mar 2022 19:31:45 +0800 Subject: [PATCH 32/32] rebase to stream-load-vec branch --- be/src/common/config.h | 1 + be/src/exec/tablet_sink.cpp | 27 ++++++++++++----- be/src/exec/tablet_sink.h | 29 ++++++++++--------- be/src/olap/delta_writer.h | 3 +- be/src/olap/memtable.cpp | 4 +-- be/src/olap/olap_define.h | 3 ++ be/src/olap/row_block2.cpp | 2 -- be/src/runtime/load_channel.h | 2 +- be/src/runtime/load_channel_mgr.cpp | 7 ++--- be/src/runtime/load_channel_mgr.h | 3 +- be/src/vec/exec/vbroker_scan_node.cpp | 2 +- be/src/vec/olap/vdelta_writer.cpp | 12 ++++---- be/src/vec/olap/vdelta_writer.h | 6 ++-- be/src/vec/runtime/vload_channel.cpp | 7 ++--- be/src/vec/runtime/vload_channel.h | 5 ++-- be/src/vec/runtime/vload_channel_mgr.cpp | 5 ++-- be/src/vec/runtime/vload_channel_mgr.h | 3 +- be/src/vec/runtime/vtablets_channel.cpp | 6 ++-- be/src/vec/runtime/vtablets_channel.h | 2 +- 
be/src/vec/sink/vtablet_sink.cpp | 37 +++++++++++++++--------- be/src/vec/sink/vtablet_sink.h | 7 +++-- be/test/vec/olap/vdelta_writer_test.cpp | 6 ++-- 22 files changed, 98 insertions(+), 81 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index f6987e746904b7..f5a3900584a95d 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -731,6 +731,7 @@ CONF_mInt32(string_type_length_soft_limit_bytes, "1048576"); CONF_Validator(string_type_length_soft_limit_bytes, [](const int config) -> bool { return config > 0 && config <= 2147483643; }); + CONF_mBool(enable_vectorized_load, "false"); } // namespace config diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp index 56ee842f957058..042f5a2c19f586 100644 --- a/be/src/exec/tablet_sink.cpp +++ b/be/src/exec/tablet_sink.cpp @@ -102,7 +102,7 @@ Status NodeChannel::init(RuntimeState* state) { } if (!_is_vectorized) { - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker.get())); + _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); // Initialize _cur_add_batch_request _cur_add_batch_request.set_allocated_id(&_parent->_load_id); @@ -197,8 +197,15 @@ Status NodeChannel::open_wait() { // add batch closure _add_batch_closure = ReusableClosure::create(); _add_batch_closure->addFailedHandler([this](bool is_last_rpc) { + std::lock_guard l(this->_closed_lock); + if (this->_is_closed) { + // if the node channel is closed, no need to call `mark_as_failed`, + // and notice that _index_channel may already be destroyed. 
+ return; + } // If rpc failed, mark all tablets on this node channel as failed - _index_channel->mark_as_failed(this->node_id(), this->host(), _add_batch_closure->cntl.ErrorText(), -1); + _index_channel->mark_as_failed(this->node_id(), this->host(), + _add_batch_closure->cntl.ErrorText(), -1); Status st = _index_channel->check_intolerable_failure(); if (!st.ok()) { _cancel_with_msg(fmt::format("{}, err: {}", channel_info(), st.get_error_msg())); @@ -211,11 +218,18 @@ Status NodeChannel::open_wait() { _add_batch_closure->addSuccessHandler([this](const PTabletWriterAddBatchResult& result, bool is_last_rpc) { + std::lock_guard l(this->_closed_lock); + if (this->_is_closed) { + // if the node channel is closed, no need to call the following logic, + // and notice that _index_channel may already be destroyed. + return; + } Status status(result.status()); if (status.ok()) { // if has error tablet, handle them first for (auto& error : result.tablet_errors()) { - _index_channel->mark_as_failed(this->node_id(), this->host(), error.msg(), error.tablet_id()); + _index_channel->mark_as_failed(this->node_id(), this->host(), error.msg(), + error.tablet_id()); } Status st = _index_channel->check_intolerable_failure(); @@ -722,7 +736,7 @@ Status OlapTableSink::prepare(RuntimeState* state) { } } - _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size(), _mem_tracker.get())); + _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size())); } _max_decimalv2_val.resize(_output_tuple_desc->slots().size()); @@ -791,9 +805,8 @@ Status OlapTableSink::prepare(RuntimeState* state) { } else { index_channel = new IndexChannel(this, index->index_id); } - auto channel = _pool->add(index_channel); - RETURN_IF_ERROR(channel->init(state, tablets)); - _channels.emplace_back(channel); + RETURN_IF_ERROR(index_channel->init(state, tablets)); + _channels.emplace_back(index_channel); } return Status::OK(); diff --git a/be/src/exec/tablet_sink.h b/be/src/exec/tablet_sink.h 
index b00d896ff1a2e2..c1d592c40db518 100644 --- a/be/src/exec/tablet_sink.h +++ b/be/src/exec/tablet_sink.h @@ -186,7 +186,7 @@ class NodeChannel { // only allow 1 rpc in flight // plz make sure, this func should be called after open_wait(). virtual int try_send_and_fetch_status(RuntimeState* state, - std::unique_ptr& thread_pool_token); + std::unique_ptr& thread_pool_token); void try_send_batch(RuntimeState* state); @@ -245,6 +245,9 @@ class NodeChannel { int64_t _next_packet_seq = 0; MonotonicStopWatch _timeout_watch; + // the timestamp when this node channel be marked closed and finished closed + uint64_t _close_time_ms = 0; + // user cancel or get some errors std::atomic _cancelled {false}; SpinLock _cancel_msg_lock; @@ -281,19 +284,6 @@ class NodeChannel { std::atomic _queue_push_lock_ns {0}; std::atomic _actual_consume_ns {0}; -private: - // buffer for saving serialized row batch data. - // In the non-attachment approach, we need to use two PRowBatch structures alternately - // so that when one PRowBatch is sent, the other PRowBatch can be used for the serialization of the next RowBatch. - // This is not necessary with the attachment approach, because the memory structures - // are already copied into attachment memory before sending, and will wait for - // the previous RPC to be fully completed before the next copy. - std::string _tuple_data_buffer; - std::string* _tuple_data_buffer_ptr = nullptr; - - // the timestamp when this node channel be marked closed and finished closed - uint64_t _close_time_ms = 0; - // lock to protect _is_closed. // The methods in the IndexChannel are called back in the RpcClosure in the NodeChannel. // However, this rpc callback may occur after the whole task is finished (e.g. due to network latency), @@ -304,6 +294,17 @@ class NodeChannel { // The IndexChannel is definitely accessible until the NodeChannel is closed. std::mutex _closed_lock; bool _is_closed = false; + +private: + // buffer for saving serialized row batch data. 
+ // In the non-attachment approach, we need to use two PRowBatch structures alternately + // so that when one PRowBatch is sent, the other PRowBatch can be used for the serialization of the next RowBatch. + // This is not necessary with the attachment approach, because the memory structures + // are already copied into attachment memory before sending, and will wait for + // the previous RPC to be fully completed before the next copy. + std::string _tuple_data_buffer; + std::string* _tuple_data_buffer_ptr = nullptr; + std::unique_ptr _cur_batch; PTabletWriterAddBatchRequest _cur_add_batch_request; using AddBatchReq = std::pair, PTabletWriterAddBatchRequest>; diff --git a/be/src/olap/delta_writer.h b/be/src/olap/delta_writer.h index d2044526d3b59c..9f17beed933452 100644 --- a/be/src/olap/delta_writer.h +++ b/be/src/olap/delta_writer.h @@ -92,12 +92,13 @@ class DeltaWriter { int64_t tablet_id() { return _tablet->tablet_id(); } -private: +protected: DeltaWriter(WriteRequest* req, StorageEngine* storage_engine); // push a full memtable to flush executor OLAPStatus _flush_memtable_async(); +private: void _garbage_collection(); virtual void _reset_mem_table(); diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index ef7c807877d94a..176f4fcbb95490 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -110,7 +110,7 @@ MemTable::~MemTable() { delete row; } } - _mem_tracker->Release(_mem_usage); + _mem_tracker->release(_mem_usage); } MemTable::RowCursorComparator::RowCursorComparator(const Schema* schema) : _schema(schema) {} @@ -145,7 +145,7 @@ void MemTable::insert(const vectorized::Block* block, size_t row_pos, size_t num _input_mutable_block.add_rows(block, row_pos, num_rows); size_t newsize = _input_mutable_block.allocated_bytes(); _mem_usage += newsize - oldsize; - _mem_tracker->Consume(newsize - oldsize); + _mem_tracker->consume(newsize - oldsize); for(int i = 0; i < num_rows; i++){ RowInBlock* row_in_block_ptr = new 
RowInBlock(cursor_in_mutableblock + i); diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h index 6af5024a339269..b16c614d062016 100644 --- a/be/src/olap/olap_define.h +++ b/be/src/olap/olap_define.h @@ -56,6 +56,9 @@ static const uint16_t OLAP_VARCHAR_MAX_LENGTH = 65535; // the max length supported for string type 2GB static const uint32_t OLAP_STRING_MAX_LENGTH = 2147483647; +// the max length supported for vec string type 1MB +static constexpr size_t MAX_SIZE_OF_VEC_STRING = 1024 * 1024; + // the max length supported for array static const uint16_t OLAP_ARRAY_MAX_LENGTH = 65535; diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp index 4f2feec37bd88c..7ea1fb1f3677a2 100644 --- a/be/src/olap/row_block2.cpp +++ b/be/src/olap/row_block2.cpp @@ -354,8 +354,6 @@ Status RowBlockV2::_copy_data_to_column(int cid, Status RowBlockV2::_append_data_to_column(const ColumnVectorBatch* batch, size_t start, uint32_t len, doris::vectorized::MutableColumnPtr& origin_column) { - constexpr auto MAX_SIZE_OF_VEC_STRING = 1024l * 1024; - auto* column = origin_column.get(); uint32_t selected_size = len; bool nullable_mark_array[selected_size]; diff --git a/be/src/runtime/load_channel.h b/be/src/runtime/load_channel.h index c2ddeaff368869..4b4708679dd697 100644 --- a/be/src/runtime/load_channel.h +++ b/be/src/runtime/load_channel.h @@ -40,7 +40,7 @@ class LoadChannel { public: LoadChannel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, bool is_high_priority, const std::string& sender_ip); - ~LoadChannel(); + virtual ~LoadChannel(); // open a new load channel if not exist virtual Status open(const PTabletWriterOpenRequest& request); diff --git a/be/src/runtime/load_channel_mgr.cpp b/be/src/runtime/load_channel_mgr.cpp index c2d5d053f3a390..be57e27907a4cb 100644 --- a/be/src/runtime/load_channel_mgr.cpp +++ b/be/src/runtime/load_channel_mgr.cpp @@ -94,9 +94,8 @@ Status LoadChannelMgr::init(int64_t process_mem_limit) { LoadChannel* 
LoadChannelMgr::_create_load_channel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, - const std::shared_ptr& mem_tracker, bool is_high_priority, - const std::string& sender_ip) { - return new LoadChannel(load_id, mem_limit, timeout_s, mem_tracker, is_high_priority, sender_ip); + bool is_high_priority, const std::string& sender_ip) { + return new LoadChannel(load_id, mem_limit, timeout_s, is_high_priority, sender_ip); } Status LoadChannelMgr::open(const PTabletWriterOpenRequest& params) { @@ -118,7 +117,7 @@ Status LoadChannelMgr::open(const PTabletWriterOpenRequest& params) { int64_t job_timeout_s = calc_job_timeout_s(timeout_in_req_s); bool is_high_priority = (params.has_is_high_priority() && params.is_high_priority()); - channel.reset(new LoadChannel(load_id, job_max_memory, job_timeout_s, is_high_priority, + channel.reset(_create_load_channel(load_id, job_max_memory, job_timeout_s, is_high_priority, params.sender_ip())); _load_channels.insert({load_id, channel}); } diff --git a/be/src/runtime/load_channel_mgr.h b/be/src/runtime/load_channel_mgr.h index b8cd673246f38d..2bb982cb087935 100644 --- a/be/src/runtime/load_channel_mgr.h +++ b/be/src/runtime/load_channel_mgr.h @@ -64,8 +64,7 @@ class LoadChannelMgr { protected: virtual LoadChannel* _create_load_channel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, - const std::shared_ptr& mem_tracker, bool is_high_priority, - const std::string& sender_ip); + bool is_high_priority, const std::string& sender_ip); template Status _get_load_channel(std::shared_ptr& channel, diff --git a/be/src/vec/exec/vbroker_scan_node.cpp b/be/src/vec/exec/vbroker_scan_node.cpp index bc20bf05e8c78a..9cb919228d0043 100644 --- a/be/src/vec/exec/vbroker_scan_node.cpp +++ b/be/src/vec/exec/vbroker_scan_node.cpp @@ -168,7 +168,7 @@ Status VBrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, // 1. too many batches in queue, or // 2. at least one batch in queue and memory exceed limit. 
(_block_queue.size() >= _max_buffered_batches || - (mem_tracker()->AnyLimitExceeded(MemLimit::HARD) && !_block_queue.empty()))) { + (mem_tracker()->any_limit_exceeded() && !_block_queue.empty()))) { _queue_writer_cond.wait_for(l, std::chrono::seconds(1)); } // Process already set failed, so we just return OK diff --git a/be/src/vec/olap/vdelta_writer.cpp b/be/src/vec/olap/vdelta_writer.cpp index c4480c9e2577f5..d7002b299405d3 100644 --- a/be/src/vec/olap/vdelta_writer.cpp +++ b/be/src/vec/olap/vdelta_writer.cpp @@ -23,17 +23,15 @@ namespace doris { namespace vectorized { -VDeltaWriter::VDeltaWriter(WriteRequest* req, const std::shared_ptr& parent, - StorageEngine* storage_engine) - : DeltaWriter(req, parent, storage_engine) {} +VDeltaWriter::VDeltaWriter(WriteRequest* req, StorageEngine* storage_engine) + : DeltaWriter(req, storage_engine) {} VDeltaWriter::~VDeltaWriter() { } -OLAPStatus VDeltaWriter::open(WriteRequest* req, const std::shared_ptr& parent, - VDeltaWriter** writer) { - *writer = new VDeltaWriter(req, parent, StorageEngine::instance()); +OLAPStatus VDeltaWriter::open(WriteRequest* req, VDeltaWriter** writer) { + *writer = new VDeltaWriter(req, StorageEngine::instance()); return OLAP_SUCCESS; } @@ -75,7 +73,7 @@ OLAPStatus VDeltaWriter::write_block(const vectorized::Block* block, const std:: void VDeltaWriter::_reset_mem_table() { _mem_table.reset(new MemTable(_tablet->tablet_id(), _schema.get(), _tablet_schema, _req.slots, _req.tuple_desc, _tablet->keys_type(), _rowset_writer.get(), - _mem_tracker, true)); + true)); } } // namespace vectorized diff --git a/be/src/vec/olap/vdelta_writer.h b/be/src/vec/olap/vdelta_writer.h index ac2a6c55568f36..6d737777512aa7 100644 --- a/be/src/vec/olap/vdelta_writer.h +++ b/be/src/vec/olap/vdelta_writer.h @@ -27,8 +27,7 @@ class VDeltaWriter : public DeltaWriter { public: virtual ~VDeltaWriter() override; - static OLAPStatus open(WriteRequest* req, const std::shared_ptr& parent, - VDeltaWriter** writer); + static 
OLAPStatus open(WriteRequest* req, VDeltaWriter** writer); virtual OLAPStatus write_block(const vectorized::Block* block, const std::vector& row_idxs) override; @@ -36,8 +35,7 @@ class VDeltaWriter : public DeltaWriter { virtual void _reset_mem_table() override; private: - VDeltaWriter(WriteRequest* req, const std::shared_ptr& parent, - StorageEngine* storage_engine); + VDeltaWriter(WriteRequest* req, StorageEngine* storage_engine); }; } // namespace vectorized diff --git a/be/src/vec/runtime/vload_channel.cpp b/be/src/vec/runtime/vload_channel.cpp index efca66a1d9507b..5ac1c7d8f2397e 100644 --- a/be/src/vec/runtime/vload_channel.cpp +++ b/be/src/vec/runtime/vload_channel.cpp @@ -23,9 +23,8 @@ namespace doris { namespace vectorized { VLoadChannel::VLoadChannel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, - const std::shared_ptr& mem_tracker, bool is_high_priority, - const std::string& sender_ip) - : LoadChannel(load_id, mem_limit, timeout_s, mem_tracker, is_high_priority, sender_ip) { + bool is_high_priority, const std::string& sender_ip) + : LoadChannel(load_id, mem_limit, timeout_s, is_high_priority, sender_ip) { } Status VLoadChannel::open(const PTabletWriterOpenRequest& params) { @@ -39,7 +38,7 @@ Status VLoadChannel::open(const PTabletWriterOpenRequest& params) { } else { // create a new tablets channel TabletsChannelKey key(params.id(), index_id); - channel.reset(new VTabletsChannel(key, _mem_tracker, _is_high_priority)); + channel.reset(new VTabletsChannel(key, _is_high_priority)); _tablets_channels.insert({index_id, channel}); } } diff --git a/be/src/vec/runtime/vload_channel.h b/be/src/vec/runtime/vload_channel.h index 9de359236d29f3..411625e9fb3540 100644 --- a/be/src/vec/runtime/vload_channel.h +++ b/be/src/vec/runtime/vload_channel.h @@ -26,10 +26,9 @@ namespace vectorized { class VLoadChannel : public LoadChannel { public: VLoadChannel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, - const std::shared_ptr& mem_tracker, 
bool is_high_priority, - const std::string& sender_ip); + bool is_high_priority, const std::string& sender_ip); - virtual ~VLoadChannel() override = default; + ~VLoadChannel() override {}; virtual Status open(const PTabletWriterOpenRequest& request) override; diff --git a/be/src/vec/runtime/vload_channel_mgr.cpp b/be/src/vec/runtime/vload_channel_mgr.cpp index daf57c855b83af..aa353515f8b9ac 100644 --- a/be/src/vec/runtime/vload_channel_mgr.cpp +++ b/be/src/vec/runtime/vload_channel_mgr.cpp @@ -28,9 +28,8 @@ VLoadChannelMgr::~VLoadChannelMgr() {} LoadChannel* VLoadChannelMgr::_create_load_channel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, - const std::shared_ptr& mem_tracker, bool is_high_priority, - const std::string& sender_ip) { - return new VLoadChannel(load_id, mem_limit, timeout_s, mem_tracker, is_high_priority, sender_ip); + bool is_high_priority, const std::string& sender_ip) { + return new VLoadChannel(load_id, mem_limit, timeout_s, is_high_priority, sender_ip); } Status VLoadChannelMgr::add_block(const PTabletWriterAddBlockRequest& request, diff --git a/be/src/vec/runtime/vload_channel_mgr.h b/be/src/vec/runtime/vload_channel_mgr.h index f0ea13a786e5c2..cbe53336d70619 100644 --- a/be/src/vec/runtime/vload_channel_mgr.h +++ b/be/src/vec/runtime/vload_channel_mgr.h @@ -35,8 +35,7 @@ class VLoadChannelMgr : public LoadChannelMgr { PTabletWriterAddBlockResult* response) override; protected: LoadChannel* _create_load_channel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, - const std::shared_ptr& mem_tracker, bool is_high_priority, - const std::string& sender_ip) override; + bool is_high_priority, const std::string& sender_ip) override; }; } // namespace vectorized diff --git a/be/src/vec/runtime/vtablets_channel.cpp b/be/src/vec/runtime/vtablets_channel.cpp index 6d37677ef92540..fb3244d821a616 100644 --- a/be/src/vec/runtime/vtablets_channel.cpp +++ b/be/src/vec/runtime/vtablets_channel.cpp @@ -29,9 +29,8 @@ namespace doris { 
namespace vectorized { VTabletsChannel::VTabletsChannel(const TabletsChannelKey& key, - const std::shared_ptr& mem_tracker, bool is_high_priority) - : TabletsChannel(key, mem_tracker, is_high_priority) {} + : TabletsChannel(key, is_high_priority) {} Status VTabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& request) { std::vector* index_slots = nullptr; @@ -56,12 +55,11 @@ Status VTabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& reques wrequest.txn_id = _txn_id; wrequest.partition_id = tablet.partition_id(); wrequest.load_id = request.id(); - wrequest.need_gen_rollup = request.need_gen_rollup(); wrequest.slots = index_slots; wrequest.is_high_priority = _is_high_priority; VDeltaWriter* writer = nullptr; - auto st = VDeltaWriter::open(&wrequest, _mem_tracker, &writer); + auto st = VDeltaWriter::open(&wrequest, &writer); if (st != OLAP_SUCCESS) { std::stringstream ss; ss << "open delta writer failed, tablet_id=" << tablet.tablet_id() diff --git a/be/src/vec/runtime/vtablets_channel.h b/be/src/vec/runtime/vtablets_channel.h index 1f7d31c1ba0238..45bc652a5eceb2 100644 --- a/be/src/vec/runtime/vtablets_channel.h +++ b/be/src/vec/runtime/vtablets_channel.h @@ -26,7 +26,7 @@ namespace vectorized { class VTabletsChannel : public TabletsChannel { public: - VTabletsChannel(const TabletsChannelKey& key, const std::shared_ptr& mem_tracker, bool is_high_priority); + VTabletsChannel(const TabletsChannelKey& key, bool is_high_priority); virtual Status add_block(const PTabletWriterAddBlockRequest& request, PTabletWriterAddBlockResult* response) override; diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp index a3ace9da8abdf8..300ba51b3eb4b8 100644 --- a/be/src/vec/sink/vtablet_sink.cpp +++ b/be/src/vec/sink/vtablet_sink.cpp @@ -17,6 +17,7 @@ #include "vec/sink/vtablet_sink.h" +#include "runtime/thread_context.h" #include "util/doris_metrics.h" #include "vec/core/block.h" #include "vec/exprs/vexpr.h" @@ -73,8 +74,15 @@ 
Status VNodeChannel::open_wait() { // add block closure _add_block_closure = ReusableClosure::create(); _add_block_closure->addFailedHandler([this](bool is_last_rpc) { + std::lock_guard l(this->_closed_lock); + if (this->_is_closed) { + // if the node channel is closed, no need to call `mark_as_failed`, + // and notice that _index_channel may already be destroyed. + return; + } // If rpc failed, mark all tablets on this node channel as failed - _index_channel->mark_as_failed(this->node_id(), this->host(), _add_block_closure->cntl.ErrorText(), -1); + _index_channel->mark_as_failed(this->node_id(), this->host(), + _add_block_closure->cntl.ErrorText(), -1); Status st = _index_channel->check_intolerable_failure(); if (!st.ok()) { _cancel_with_msg(fmt::format("{}, err: {}", channel_info(), st.get_error_msg())); @@ -87,6 +95,12 @@ Status VNodeChannel::open_wait() { _add_block_closure->addSuccessHandler([this](const PTabletWriterAddBlockResult& result, bool is_last_rpc) { + std::lock_guard l(this->_closed_lock); + if (this->_is_closed) { + // if the node channel is closed, no need to call the following logic, + // and notice that _index_channel may already be destroyed. + return; + } Status status(result.status()); if (status.ok()) { // if has error tablet, handle them first @@ -137,7 +151,7 @@ Status VNodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { // But there is still some unfinished things, we do mem limit here temporarily. // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). 
- while (!_cancelled && _parent->_mem_tracker->AnyLimitExceeded(MemLimit::HARD) && + while (!_cancelled && _parent->_mem_tracker->any_limit_exceeded() && _pending_batches_num > 0) { SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); SleepFor(MonoDelta::FromMilliseconds(10)); @@ -162,7 +176,8 @@ Status VNodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { return Status::OK(); } -int VNodeChannel::try_send_and_fetch_status(std::unique_ptr& thread_pool_token) { +int VNodeChannel::try_send_and_fetch_status(RuntimeState* state, + std::unique_ptr& thread_pool_token) { auto st = none_of({_cancelled, _send_finished}); if (!st.ok()) { return 0; @@ -170,14 +185,16 @@ int VNodeChannel::try_send_and_fetch_status(std::unique_ptr& th bool is_finished = true; if (!_add_block_closure->is_packet_in_flight() && _pending_batches_num > 0 && _last_patch_processed_finished.compare_exchange_strong(is_finished, false)) { - auto s = thread_pool_token->submit_func(std::bind(&VNodeChannel::try_send_block, this)); + auto s = thread_pool_token->submit_func( + std::bind(&VNodeChannel::try_send_block, this, state)); if (!s.ok()) { _cancel_with_msg("submit send_batch task to send_batch_thread_pool failed"); } } return _send_finished ? 0 : 1; } -void VNodeChannel::try_send_block() { +void VNodeChannel::try_send_block(RuntimeState* state) { + SCOPED_ATTACH_TASK_THREAD(state, _node_channel_tracker); SCOPED_ATOMIC_TIMER(&_actual_consume_ns); AddBlockReq send_block; { @@ -256,15 +273,10 @@ void VNodeChannel::_close_check() { CHECK(_cur_mutable_block == nullptr) << name(); } -Status VNodeChannel::mark_close() { +void VNodeChannel::mark_close() { auto st = none_of({_cancelled, _eos_is_produced}); if (!st.ok()) { - if (_cancelled) { - std::lock_guard l(_cancel_msg_lock); - return Status::InternalError("mark close failed. " + _cancel_msg); - } else { - return st.clone_and_prepend("already stopped, can't mark as closed. 
cancelled/eos: "); - } + return; } _cur_add_block_request.set_eos(true); @@ -280,7 +292,6 @@ Status VNodeChannel::mark_close() { } _eos_is_produced = true; - return Status::OK(); } VIndexChannel::VIndexChannel(OlapTableSink* parent, int64_t index_id) diff --git a/be/src/vec/sink/vtablet_sink.h b/be/src/vec/sink/vtablet_sink.h index 43528a2e86e7d2..65a90ad5050fcd 100644 --- a/be/src/vec/sink/vtablet_sink.h +++ b/be/src/vec/sink/vtablet_sink.h @@ -41,16 +41,17 @@ class VNodeChannel : public NodeChannel { Status add_row(BlockRow& block_row, int64_t tablet_id) override; - int try_send_and_fetch_status(std::unique_ptr& thread_pool_token) override; + int try_send_and_fetch_status(RuntimeState* state, + std::unique_ptr& thread_pool_token) override; - void try_send_block(); + void try_send_block(RuntimeState* state); void clear_all_blocks() override; // two ways to stop channel: // 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response. // 2. just cancel() - Status mark_close() override; + void mark_close() override; protected: void _close_check() override; diff --git a/be/test/vec/olap/vdelta_writer_test.cpp b/be/test/vec/olap/vdelta_writer_test.cpp index cbd837ce43f001..f505c152f79d42 100644 --- a/be/test/vec/olap/vdelta_writer_test.cpp +++ b/be/test/vec/olap/vdelta_writer_test.cpp @@ -369,7 +369,7 @@ TEST_F(VTestDeltaWriter, open) { WriteRequest write_req = {10003, 270068375, WriteType::LOAD, 20001, 30001, load_id, false, tuple_desc}; vectorized::VDeltaWriter* delta_writer = nullptr; - vectorized::VDeltaWriter::open(&write_req, k_mem_tracker, &delta_writer); + vectorized::VDeltaWriter::open(&write_req, &delta_writer); ASSERT_NE(delta_writer, nullptr); res = delta_writer->close(); ASSERT_EQ(OLAP_SUCCESS, res); @@ -403,7 +403,7 @@ TEST_F(VTestDeltaWriter, write) { WriteRequest write_req = {10004, 270068376, WriteType::LOAD, 20002, 30002, load_id, false, tuple_desc, &(tuple_desc->slots())}; vectorized::VDeltaWriter* 
delta_writer = nullptr; - vectorized::VDeltaWriter::open(&write_req, k_mem_tracker, &delta_writer); + vectorized::VDeltaWriter::open(&write_req, &delta_writer); ASSERT_NE(delta_writer, nullptr); auto tracker = std::make_shared(); @@ -543,7 +543,7 @@ TEST_F(VTestDeltaWriter, sequence_col) { WriteRequest write_req = {10005, 270068377, WriteType::LOAD, 20003, 30003, load_id, false, tuple_desc, &(tuple_desc->slots())}; vectorized::VDeltaWriter* delta_writer = nullptr; - vectorized::VDeltaWriter::open(&write_req, k_mem_tracker, &delta_writer); + vectorized::VDeltaWriter::open(&write_req, &delta_writer); ASSERT_NE(delta_writer, nullptr); MemTracker tracker;