diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index a0eaf570714ebd..2aa7b522afb823 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -515,7 +515,7 @@ void TaskWorkerPool::_alter_tablet(const TAgentTaskRequest& agent_task_req, int6 string process_name; switch (task_type) { case TTaskType::ALTER: - process_name = "alter"; + process_name = "AlterTablet"; break; default: std::string task_name; diff --git a/be/src/common/config.h b/be/src/common/config.h index cc3448928da584..c188d9a4aec64b 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -595,12 +595,28 @@ CONF_Int32(aws_log_level, "3"); // the buffer size when read data from remote storage like s3 CONF_mInt32(remote_storage_read_buffer_mb, "16"); +// Whether to install the TCMalloc new/delete hooks; MemTracker consumption is currently counted in these hooks. +CONF_mBool(use_tc_hook, "true"); + // Default level of MemTracker to show in web page // now MemTracker support two level: -// RELEASE: 0 -// DEBUG: 1 +// OVERVIEW: 0 +// TASK: 1 +// INSTANCE: 2 +// VERBOSE: 3 // the level equal or lower than mem_tracker_level will show in web page -CONF_Int16(mem_tracker_level, "0"); +CONF_mInt16(mem_tracker_level, "0"); + +// The minimum size, in bytes, at which the TCMalloc hook consumes/releases MemTracker; +// amounts smaller than this value keep accumulating until the threshold is reached. +// Decreasing this value increases the frequency of consume/release calls; +// increasing it makes MemTracker statistics less accurate. +CONF_mInt32(mem_tracker_consume_min_size_bytes, "2097152"); + +// A negative MemTracker value is treated as evidence of a memory leak; however, +// inaccurate MemTracker accounting can also drive the value negative, +// so this feature is still experimental. +CONF_mBool(memory_leak_detection, "false"); // The version information of the tablet will be stored in the memory // in an adjacency graph data structure. diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index 044feda7cf6158..36e4f84e36c03f 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -95,17 +95,6 @@ void Daemon::memory_maintenance_thread() { if (env != nullptr) { BufferPool* buffer_pool = env->buffer_pool(); if (buffer_pool != nullptr) buffer_pool->Maintenance(); - - // The process limit as measured by our trackers may get out of sync with the - // process usage if memory is allocated or freed without updating a MemTracker. - // The metric is refreshed whenever memory is consumed or released via a MemTracker, - // so on a system with queries executing it will be refreshed frequently. However - // if the system is idle, we need to refresh the tracker occasionally since - // untracked memory may be allocated or freed, e.g. by background threads. 
- if (env->process_mem_tracker() != nullptr && - !env->process_mem_tracker()->is_consumption_metric_null()) { - env->process_mem_tracker()->RefreshConsumptionFromMetric(); - } } } } diff --git a/be/src/exec/aggregation_node.cpp b/be/src/exec/aggregation_node.cpp index 82c8f6c267d35f..51b88830a2230a 100644 --- a/be/src/exec/aggregation_node.cpp +++ b/be/src/exec/aggregation_node.cpp @@ -34,6 +34,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/string_value.hpp" +#include "runtime/thread_context.h" #include "runtime/tuple.h" #include "runtime/tuple_row.h" #include "util/runtime_profile.h" @@ -77,6 +78,7 @@ Status AggregationNode::init(const TPlanNode& tnode, RuntimeState* state) { Status AggregationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _get_results_timer = ADD_TIMER(runtime_profile(), "GetResultsTime"); _hash_table_buckets_counter = ADD_COUNTER(runtime_profile(), "BuildBuckets", TUnit::UNIT); @@ -106,7 +108,7 @@ Status AggregationNode::prepare(RuntimeState* state) { RowDescriptor build_row_desc(_intermediate_tuple_desc, false); RETURN_IF_ERROR(Expr::prepare(_build_expr_ctxs, state, build_row_desc, expr_mem_tracker())); - _tuple_pool.reset(new MemPool(mem_tracker().get())); + _tuple_pool.reset(new MemPool()); _agg_fn_ctxs.resize(_aggregate_evaluators.size()); int j = _probe_expr_ctxs.size(); @@ -141,6 +143,7 @@ Status AggregationNode::prepare(RuntimeState* state) { } Status AggregationNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); @@ -153,7 +156,7 @@ Status AggregationNode::open(RuntimeState* state) { RETURN_IF_ERROR(_children[0]->open(state)); - RowBatch batch(_children[0]->row_desc(), state->batch_size(), mem_tracker().get()); + RowBatch batch(_children[0]->row_desc(), state->batch_size()); int64_t num_input_rows = 0; int64_t num_agg_rows = 0; @@ -227,6 +230,7 @@ Status AggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* // 3. `child(0)->rows_returned() == 0` mean not data from child // in level two aggregation node should return nullptr result // level one aggregation node set `eos = true` return directly + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (UNLIKELY(!_needs_finalize && _singleton_output_tuple != nullptr && child(0)->rows_returned() == 0)) { *eos = true; @@ -288,6 +292,7 @@ Status AggregationNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // Iterate through the remaining rows in the hash table and call Serialize/Finalize on // them in order to free any memory allocated by UDAs. 
Finalize() requires a dst tuple diff --git a/be/src/exec/analytic_eval_node.cpp b/be/src/exec/analytic_eval_node.cpp index df1b4cea275537..ff9cc412673a20 100644 --- a/be/src/exec/analytic_eval_node.cpp +++ b/be/src/exec/analytic_eval_node.cpp @@ -22,6 +22,7 @@ #include "runtime/descriptors.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "udf/udf_internal.h" namespace doris { @@ -141,10 +142,11 @@ Status AnalyticEvalNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); DCHECK(child(0)->row_desc().is_prefix_of(row_desc())); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _child_tuple_desc = child(0)->row_desc().tuple_descriptors()[0]; - _curr_tuple_pool.reset(new MemPool(mem_tracker().get())); - _prev_tuple_pool.reset(new MemPool(mem_tracker().get())); - _mem_pool.reset(new MemPool(mem_tracker().get())); + _curr_tuple_pool.reset(new MemPool()); + _prev_tuple_pool.reset(new MemPool()); + _mem_pool.reset(new MemPool()); _evaluation_timer = ADD_TIMER(runtime_profile(), "EvaluationTime"); DCHECK_EQ(_result_tuple_desc->slots().size(), _evaluators.size()); @@ -183,6 +185,7 @@ Status AnalyticEvalNode::prepare(RuntimeState* state) { } Status AnalyticEvalNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_CANCELLED(state); @@ -201,7 +204,7 @@ Status AnalyticEvalNode::open(RuntimeState* state) { "Failed to acquire initial read buffer for analytic function " "evaluation. Reducing query concurrency or increasing the memory limit may " "help this query to complete successfully."); - return mem_tracker()->MemLimitExceeded(state, msg, -1); + RETURN_LIMIT_EXCEEDED(mem_tracker(), state, msg); } DCHECK_EQ(_evaluators.size(), _fn_ctxs.size()); @@ -236,10 +239,8 @@ Status AnalyticEvalNode::open(RuntimeState* state) { // Fetch the first input batch so that some _prev_input_row can be set here to avoid // special casing in GetNext(). 
- _prev_child_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); - _curr_child_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + _prev_child_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); + _curr_child_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); while (!_input_eos && _prev_input_row == nullptr) { RETURN_IF_ERROR(child(0)->get_next(state, _curr_child_batch.get(), &_input_eos)); @@ -738,7 +739,7 @@ Status AnalyticEvalNode::get_next_output_batch(RuntimeState* state, RowBatch* ou ExprContext** ctxs = &_conjunct_ctxs[0]; int num_ctxs = _conjunct_ctxs.size(); - RowBatch input_batch(child(0)->row_desc(), output_batch->capacity(), mem_tracker().get()); + RowBatch input_batch(child(0)->row_desc(), output_batch->capacity()); int64_t stream_idx = _input_stream->rows_returned(); RETURN_IF_ERROR(_input_stream->get_next(&input_batch, eos)); @@ -813,6 +814,7 @@ inline int64_t AnalyticEvalNode::num_output_rows_ready() const { } Status AnalyticEvalNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); @@ -857,6 +859,7 @@ Status AnalyticEvalNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (_input_stream.get() != nullptr) { _input_stream->close(); diff --git a/be/src/exec/assert_num_rows_node.cpp b/be/src/exec/assert_num_rows_node.cpp index 6c84dfc1f05f8d..4f9de8ad77a2d6 100644 --- a/be/src/exec/assert_num_rows_node.cpp +++ b/be/src/exec/assert_num_rows_node.cpp @@ -21,6 +21,7 @@ #include "gutil/strings/substitute.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris { @@ -48,6 +49,7 @@ Status AssertNumRowsNode::prepare(RuntimeState* state) { } Status AssertNumRowsNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); // ISSUE-3435 @@ -56,6 +58,7 @@ Status AssertNumRowsNode::open(RuntimeState* state) { } Status AssertNumRowsNode::get_next(RuntimeState* state, RowBatch* output_batch, bool* eos) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); SCOPED_TIMER(_runtime_profile->total_time_counter()); output_batch->reset(); diff --git a/be/src/exec/base_scanner.cpp b/be/src/exec/base_scanner.cpp index f9fb5e389860e5..d5dd7ec5ff2ad7 100644 --- a/be/src/exec/base_scanner.cpp +++ b/be/src/exec/base_scanner.cpp @@ -33,8 +33,7 @@ namespace doris { BaseScanner::BaseScanner(RuntimeState* state, RuntimeProfile* profile, const TBrokerScanRangeParams& params, - const std::vector& pre_filter_texprs, - ScannerCounter* counter) + const std::vector& pre_filter_texprs, ScannerCounter* counter) : _state(state), _params(params), _counter(counter), @@ -43,11 +42,10 @@ BaseScanner::BaseScanner(RuntimeState* state, RuntimeProfile* profile, #if BE_TEST _mem_tracker(new MemTracker()), #else - _mem_tracker( - MemTracker::CreateTracker(-1, "BaseScanner:" + std::to_string(state->load_job_id()), - state->instance_mem_tracker())), + _mem_tracker(MemTracker::create_tracker( + -1, 
"Scanner:" + std::to_string(state->load_job_id()))), #endif - _mem_pool(_mem_tracker.get()), + _mem_pool(_mem_tracker), _dest_tuple_desc(nullptr), _pre_filter_texprs(pre_filter_texprs), _strict_mode(false), @@ -259,5 +257,4 @@ void BaseScanner::close() { } } - } // namespace doris diff --git a/be/src/exec/blocking_join_node.cpp b/be/src/exec/blocking_join_node.cpp index ba137860ac7742..e54721dd861c64 100644 --- a/be/src/exec/blocking_join_node.cpp +++ b/be/src/exec/blocking_join_node.cpp @@ -23,6 +23,7 @@ #include "gen_cpp/PlanNodes_types.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris { @@ -45,8 +46,9 @@ BlockingJoinNode::~BlockingJoinNode() { Status BlockingJoinNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); - _build_pool.reset(new MemPool(mem_tracker().get())); + _build_pool.reset(new MemPool()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _left_child_timer = ADD_TIMER(runtime_profile(), "LeftChildTime"); _build_row_counter = ADD_COUNTER(runtime_profile(), "BuildRows", TUnit::UNIT); @@ -69,7 +71,7 @@ Status BlockingJoinNode::prepare(RuntimeState* state) { _probe_tuple_row_size = num_left_tuples * sizeof(Tuple*); _build_tuple_row_size = num_build_tuples * sizeof(Tuple*); - _left_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + _left_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); return Status::OK(); } @@ -82,10 +84,13 @@ Status BlockingJoinNode::close(RuntimeState* state) { } void BlockingJoinNode::build_side_thread(RuntimeState* state, std::promise* status) { + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), mem_tracker()); status->set_value(construct_build_side(state)); } Status BlockingJoinNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); SCOPED_TIMER(_runtime_profile->total_time_counter()); // RETURN_IF_ERROR(Expr::open(_conjuncts, state)); diff --git a/be/src/exec/broker_scan_node.cpp b/be/src/exec/broker_scan_node.cpp index 11928480d85f3d..adc959e23632f5 100644 --- a/be/src/exec/broker_scan_node.cpp +++ b/be/src/exec/broker_scan_node.cpp @@ -30,6 +30,7 @@ #include "runtime/dpp_sink_internal.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris { @@ -60,6 +61,7 @@ Status BrokerScanNode::init(const TPlanNode& tnode, RuntimeState* state) { Status BrokerScanNode::prepare(RuntimeState* state) { VLOG_QUERY << "BrokerScanNode prepare"; RETURN_IF_ERROR(ScanNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // get tuple desc _runtime_state = state; _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -86,6 +88,7 @@ Status BrokerScanNode::prepare(RuntimeState* state) { } Status BrokerScanNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); @@ -106,6 +109,7 @@ Status BrokerScanNode::start_scanners() { } Status BrokerScanNode::get_next(RuntimeState* state, 
RowBatch* row_batch, bool* eos) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); // check if CANCELLED. if (state->is_cancelled()) { @@ -190,6 +194,7 @@ Status BrokerScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); _scan_finished.store(true); @@ -254,7 +259,7 @@ Status BrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, while (!scanner_eof) { // Fill one row batch std::shared_ptr row_batch( - new RowBatch(row_desc(), _runtime_state->batch_size(), mem_tracker().get())); + new RowBatch(row_desc(), _runtime_state->batch_size())); // create new tuple buffer for row_batch MemPool* tuple_pool = row_batch->tuple_data_pool(); @@ -318,7 +323,7 @@ Status BrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, // 1. too many batches in queue, or // 2. at least one batch in queue and memory exceed limit. (_batch_queue.size() >= _max_buffered_batches || - (mem_tracker()->AnyLimitExceeded(MemLimit::HARD) && !_batch_queue.empty()))) { + (mem_tracker()->any_limit_exceeded() && !_batch_queue.empty()))) { _queue_writer_cond.wait_for(l, std::chrono::seconds(1)); } // Process already set failed, so we just return OK diff --git a/be/src/exec/broker_scanner.cpp b/be/src/exec/broker_scanner.cpp index af44b8f7ea99ba..24fa94fe25753d 100644 --- a/be/src/exec/broker_scanner.cpp +++ b/be/src/exec/broker_scanner.cpp @@ -35,7 +35,6 @@ #include "exprs/expr.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" #include "runtime/raw_value.h" #include "runtime/stream_load/load_stream_mgr.h" #include "runtime/stream_load/stream_load_pipe.h" diff --git a/be/src/exec/broker_scanner.h b/be/src/exec/broker_scanner.h index ca66cd0c44826e..638b6eb99e8d7e 100644 --- a/be/src/exec/broker_scanner.h +++ b/be/src/exec/broker_scanner.h @@ -46,7 +46,6 @@ class ExprContext; class TupleDescriptor; class TupleRow; class RowDescriptor; -class MemTracker; class RuntimeProfile; class StreamLoadPipe; diff --git a/be/src/exec/cross_join_node.cpp b/be/src/exec/cross_join_node.cpp index 8ef9b662f8fb65..4fbbc871a01574 100644 --- a/be/src/exec/cross_join_node.cpp +++ b/be/src/exec/cross_join_node.cpp @@ -23,6 +23,7 @@ #include "gen_cpp/PlanNodes_types.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/debug_util.h" #include "util/runtime_profile.h" @@ -52,10 +53,10 @@ Status CrossJoinNode::close(RuntimeState* state) { Status CrossJoinNode::construct_build_side(RuntimeState* state) { // Do a full scan of child(1) and store all build row batches. 
RETURN_IF_ERROR(child(1)->open(state)); - + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB("Cross join, while getting next from the child 1"); while (true) { - RowBatch* batch = _build_batch_pool->add( - new RowBatch(child(1)->row_desc(), state->batch_size(), mem_tracker().get())); + RowBatch* batch = + _build_batch_pool->add(new RowBatch(child(1)->row_desc(), state->batch_size())); RETURN_IF_CANCELLED(state); // TODO(zhaochun): @@ -63,9 +64,6 @@ Status CrossJoinNode::construct_build_side(RuntimeState* state) { bool eos = false; RETURN_IF_ERROR(child(1)->get_next(state, batch, &eos)); - // to prevent use too many memory - RETURN_IF_LIMIT_EXCEEDED(state, "Cross join, while getting next from the child 1."); - SCOPED_TIMER(_build_timer); _build_batches.add_row_batch(batch); VLOG_ROW << build_list_debug_string(); @@ -86,6 +84,7 @@ void CrossJoinNode::init_get_next(TupleRow* first_left_row) { Status CrossJoinNode::get_next(RuntimeState* state, RowBatch* output_batch, bool* eos) { // RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT, state)); RETURN_IF_CANCELLED(state); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); *eos = false; // TOOD(zhaochun) // RETURN_IF_ERROR(state->check_query_state()); diff --git a/be/src/exec/csv_scan_node.cpp b/be/src/exec/csv_scan_node.cpp index e005b81d9002e9..afebe91e5bc282 100644 --- a/be/src/exec/csv_scan_node.cpp +++ b/be/src/exec/csv_scan_node.cpp @@ -29,6 +29,7 @@ #include "olap/utils.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/string_value.h" #include "runtime/tuple_row.h" #include "util/debug_util.h" @@ -128,6 +129,7 @@ Status CsvScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // add timer _split_check_timer = ADD_TIMER(_runtime_profile, "split check timer"); @@ -195,7 +197,7 @@ Status CsvScanNode::prepare(RuntimeState* state) { return Status::InternalError("new a csv scanner failed."); } - _tuple_pool.reset(new (std::nothrow) MemPool(state->instance_mem_tracker().get())); + _tuple_pool.reset(new (std::nothrow) MemPool()); if (_tuple_pool.get() == nullptr) { return Status::InternalError("new a mem pool failed."); } @@ -210,6 +212,7 @@ Status CsvScanNode::prepare(RuntimeState* state) { } Status CsvScanNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); VLOG_CRITICAL << "CsvScanNode::Open"; @@ -232,6 +235,7 @@ Status CsvScanNode::open(RuntimeState* state) { } Status CsvScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "CsvScanNode::GetNext"; if (nullptr == state || nullptr == row_batch || nullptr == eos) { return Status::InternalError("input is nullptr pointer"); @@ -320,6 +324,7 @@ Status CsvScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "CsvScanNode::Close"; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); diff --git a/be/src/exec/data_sink.cpp b/be/src/exec/data_sink.cpp index 2a259482251a45..1113d98dcf4782 100644 --- a/be/src/exec/data_sink.cpp +++ b/be/src/exec/data_sink.cpp @@ -181,8 +181,8 @@ Status DataSink::init(const TDataSink& thrift_sink) { Status DataSink::prepare(RuntimeState* state) { _expr_mem_tracker = - 
MemTracker::CreateTracker(-1, _name + ":Expr:" + std::to_string(state->load_job_id()), - state->instance_mem_tracker()); + MemTracker::create_tracker(-1, _name + ":Expr:" + std::to_string(state->load_job_id()), + state->instance_mem_tracker()); return Status::OK(); } diff --git a/be/src/exec/data_sink.h b/be/src/exec/data_sink.h index fcec10aed8825e..a3d8b4ee1ee712 100644 --- a/be/src/exec/data_sink.h +++ b/be/src/exec/data_sink.h @@ -68,7 +68,6 @@ class DataSink { // It must be okay to call this multiple times. Subsequent calls should // be ignored. virtual Status close(RuntimeState* state, Status exec_status) { - _expr_mem_tracker.reset(); _closed = true; return Status::OK(); } diff --git a/be/src/exec/es/es_scroll_parser.cpp b/be/src/exec/es/es_scroll_parser.cpp index 86cd16a934af7e..cfcc1ff2a0db3a 100644 --- a/be/src/exec/es/es_scroll_parser.cpp +++ b/be/src/exec/es/es_scroll_parser.cpp @@ -355,11 +355,12 @@ Status ScrollParser::fill_tuple(const TupleDescriptor* tuple_desc, Tuple* tuple, // obj[FIELD_ID] must not be nullptr std::string _id = obj[FIELD_ID].GetString(); size_t len = _id.length(); - char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(len)); + Status rst; + char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(len, &rst)); if (UNLIKELY(buffer == nullptr)) { std::string details = strings::Substitute(ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", len, "string slot"); - return tuple_pool->mem_tracker()->MemLimitExceeded(nullptr, details, len); + RETURN_ALLOC_LIMIT_EXCEEDED(tuple_pool->mem_tracker(), nullptr, details, len, rst); } memcpy(buffer, _id.data(), len); reinterpret_cast(slot)->ptr = buffer; @@ -413,11 +414,12 @@ Status ScrollParser::fill_tuple(const TupleDescriptor* tuple_desc, Tuple* tuple, } } size_t val_size = val.length(); - char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size)); + Status rst; + char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size, &rst)); if (UNLIKELY(buffer == nullptr)) { std::string details = strings::Substitute( ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", val_size, "string slot"); - return tuple_pool->mem_tracker()->MemLimitExceeded(nullptr, details, val_size); + RETURN_ALLOC_LIMIT_EXCEEDED(tuple_pool->mem_tracker(), nullptr, details, val_size, rst); } memcpy(buffer, val.data(), val_size); reinterpret_cast(slot)->ptr = buffer; diff --git a/be/src/exec/es_http_scan_node.cpp b/be/src/exec/es_http_scan_node.cpp index 7b67486401be2f..42a122e8988bde 100644 --- a/be/src/exec/es_http_scan_node.cpp +++ b/be/src/exec/es_http_scan_node.cpp @@ -30,6 +30,7 @@ #include "runtime/dpp_sink_internal.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "service/backend_options.h" #include "util/runtime_profile.h" @@ -67,6 +68,7 @@ Status EsHttpScanNode::init(const TPlanNode& tnode, RuntimeState* state) { Status EsHttpScanNode::prepare(RuntimeState* state) { VLOG_QUERY << "EsHttpScanNode prepare"; RETURN_IF_ERROR(ScanNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _runtime_state = state; _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -113,6 +115,7 @@ Status EsHttpScanNode::build_conjuncts_list() { } Status EsHttpScanNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); 
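Note on the allocation pattern in the es_scroll_parser.cpp hunks above: the old convention of calling MemLimitExceeded() on the pool's tracker after a failed allocation is replaced by a Status out-parameter that try_allocate_unaligned fills in, which RETURN_ALLOC_LIMIT_EXCEEDED then decorates and returns. A minimal sketch of the new calling convention, with simplified assumed signatures (materialize_string_slot is illustrative, not a Doris function):

#include <cstring>  // memcpy

Status materialize_string_slot(MemPool* tuple_pool, const std::string& val, StringValue* slot) {
    Status rst;  // filled in by the pool when the tracker rejects the allocation
    char* buffer = reinterpret_cast<char*>(tuple_pool->try_allocate_unaligned(val.size(), &rst));
    if (UNLIKELY(buffer == nullptr)) {
        // Propagate the limit-exceeded status instead of reconstructing it from the tracker.
        return rst;
    }
    memcpy(buffer, val.data(), val.size());
    slot->ptr = buffer;
    slot->len = val.size();
    return Status::OK();
}
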
@@ -181,6 +184,7 @@ Status EsHttpScanNode::collect_scanners_status() { } Status EsHttpScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (state->is_cancelled()) { std::unique_lock l(_batch_queue_lock); @@ -268,6 +272,7 @@ Status EsHttpScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); _scan_finished.store(true); @@ -307,8 +312,7 @@ Status EsHttpScanNode::scanner_scan(std::unique_ptr scanner, while (!scanner_eof) { // Fill one row batch - std::shared_ptr row_batch( - new RowBatch(row_desc(), _runtime_state->batch_size(), mem_tracker().get())); + std::shared_ptr row_batch(new RowBatch(row_desc(), _runtime_state->batch_size())); // create new tuple buffer for row_batch MemPool* tuple_pool = row_batch->tuple_data_pool(); @@ -406,6 +410,9 @@ static std::string get_host_port(const std::vector& es_hosts) { } void EsHttpScanNode::scanner_worker(int start_idx, int length, std::promise& p_status) { + SCOPED_ATTACH_TASK_THREAD_4ARG(_runtime_state->query_type(), + print_id(_runtime_state->query_id()), + _runtime_state->fragment_instance_id(), mem_tracker()); // Clone expr context std::vector scanner_expr_ctxs; DCHECK(start_idx < length); diff --git a/be/src/exec/es_http_scanner.cpp b/be/src/exec/es_http_scanner.cpp index fe3d67b80daf2d..545c4699e1ade2 100644 --- a/be/src/exec/es_http_scanner.cpp +++ b/be/src/exec/es_http_scanner.cpp @@ -24,7 +24,6 @@ #include "exprs/expr_context.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" #include "runtime/raw_value.h" #include "runtime/runtime_state.h" #include "runtime/tuple.h" @@ -43,14 +42,7 @@ EsHttpScanner::EsHttpScanner(RuntimeState* state, RuntimeProfile* profile, Tuple _next_range(0), _line_eof(false), _batch_eof(false), -#if BE_TEST - _mem_tracker(new MemTracker()), -#else - _mem_tracker( - MemTracker::CreateTracker(-1, "EsHttpScanner:" + std::to_string(state->load_job_id()), - state->instance_mem_tracker())), -#endif - _mem_pool(_mem_tracker.get()), + _mem_pool("EsHttpScanner"), _tuple_desc(nullptr), _counter(counter), _es_reader(nullptr), diff --git a/be/src/exec/es_http_scanner.h b/be/src/exec/es_http_scanner.h index dcebfe164994b1..cc2380607d25f6 100644 --- a/be/src/exec/es_http_scanner.h +++ b/be/src/exec/es_http_scanner.h @@ -43,7 +43,6 @@ class TextConverter; class TupleDescriptor; class TupleRow; class RowDescriptor; -class MemTracker; class RuntimeProfile; struct EsScanCounter { @@ -82,7 +81,6 @@ class EsHttpScanner { std::vector _slot_descs; std::unique_ptr _row_desc; - std::shared_ptr _mem_tracker; MemPool _mem_pool; const TupleDescriptor* _tuple_desc; diff --git a/be/src/exec/es_scan_node.cpp b/be/src/exec/es_scan_node.cpp index fad266993beb74..a50e1d905eb45f 100644 --- a/be/src/exec/es_scan_node.cpp +++ b/be/src/exec/es_scan_node.cpp @@ -34,6 +34,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/string_value.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "service/backend_options.h" #include "util/debug_util.h" @@ -67,6 +68,7 @@ Status EsScanNode::prepare(RuntimeState* state) { VLOG_CRITICAL << "EsScanNode::Prepare"; RETURN_IF_ERROR(ScanNode::prepare(state)); + // 
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); if (_tuple_desc == nullptr) { std::stringstream ss; @@ -80,6 +82,7 @@ Status EsScanNode::prepare(RuntimeState* state) { } Status EsScanNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "EsScanNode::Open"; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); @@ -204,6 +207,7 @@ Status EsScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); // create tuple @@ -256,6 +260,7 @@ Status EsScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) Status EsScanNode::close(RuntimeState* state) { if (is_closed()) return Status::OK(); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "EsScanNode::Close"; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -771,11 +776,12 @@ Status EsScanNode::materialize_row(MemPool* tuple_pool, Tuple* tuple, } const string& val = col.string_vals[val_idx]; size_t val_size = val.size(); - char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size)); + Status rst; + char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size, &rst)); if (UNLIKELY(buffer == nullptr)) { std::string details = strings::Substitute( ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", val_size, "string slot"); - return tuple_pool->mem_tracker()->MemLimitExceeded(nullptr, details, val_size); + RETURN_ALLOC_LIMIT_EXCEEDED(tuple_pool->mem_tracker(), nullptr, details, val_size, rst); } memcpy(buffer, val.data(), val_size); reinterpret_cast(slot)->ptr = buffer; diff --git a/be/src/exec/except_node.cpp b/be/src/exec/except_node.cpp index 8229b73e53a669..f3fc6695f078f7 100644 --- a/be/src/exec/except_node.cpp +++ b/be/src/exec/except_node.cpp @@ -21,6 +21,7 @@ #include "exprs/expr.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" namespace doris { ExceptNode::ExceptNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) @@ -40,6 +41,7 @@ Status ExceptNode::init(const TPlanNode& tnode, RuntimeState* state) { Status ExceptNode::open(RuntimeState* state) { RETURN_IF_ERROR(SetOperationNode::open(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "Except , while probing the hash table."); // if a table is empty, the result must be empty if (_hash_tbl->size() == 0) { _hash_tbl_iterator = _hash_tbl->begin(); @@ -53,15 +55,13 @@ Status ExceptNode::open(RuntimeState* state) { if (i > 1) { refresh_hash_table(i); } // probe - _probe_batch.reset( - new RowBatch(child(i)->row_desc(), state->batch_size(), mem_tracker().get())); + _probe_batch.reset(new RowBatch(child(i)->row_desc(), state->batch_size())); ScopedTimer probe_timer(_probe_timer); RETURN_IF_ERROR(child(i)->open(state)); eos = false; while (!eos) { RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(child(i)->get_next(state, _probe_batch.get(), &eos)); - RETURN_IF_LIMIT_EXCEEDED(state, " Except , while probing the hash table."); for (int j = 0; j < _probe_batch->num_rows(); ++j) { VLOG_ROW << "probe row: " << get_row_output_string(_probe_batch->get_row(j), child(i)->row_desc()); @@ -90,6 +90,7 @@ Status 
ExceptNode::open(RuntimeState* state) { Status ExceptNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eos) { RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); *eos = true; if (reached_limit()) { diff --git a/be/src/exec/exchange_node.cpp b/be/src/exec/exchange_node.cpp index 14299c65a610ae..759246c55bf7bc 100644 --- a/be/src/exec/exchange_node.cpp +++ b/be/src/exec/exchange_node.cpp @@ -23,6 +23,7 @@ #include "runtime/exec_env.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris { @@ -57,6 +58,7 @@ Status ExchangeNode::init(const TPlanNode& tnode, RuntimeState* state) { Status ExchangeNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _convert_row_batch_timer = ADD_TIMER(runtime_profile(), "ConvertRowBatchTime"); // TODO: figure out appropriate buffer size DCHECK_GT(_num_senders, 0); @@ -74,6 +76,7 @@ Status ExchangeNode::prepare(RuntimeState* state) { } Status ExchangeNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); if (_is_merging) { @@ -82,8 +85,7 @@ Status ExchangeNode::open(RuntimeState* state) { // create_merger() will populate its merging heap with batches from the _stream_recvr, // so it is not necessary to call fill_input_row_batch(). if (state->enable_exchange_node_parallel_merge()) { - RETURN_IF_ERROR(_stream_recvr->create_parallel_merger(less_than, state->batch_size(), - mem_tracker().get())); + RETURN_IF_ERROR(_stream_recvr->create_parallel_merger(less_than, state->batch_size())); } else { RETURN_IF_ERROR(_stream_recvr->create_merger(less_than)); } @@ -103,6 +105,7 @@ Status ExchangeNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (_is_merging) { _sort_exec_exprs.close(state); } @@ -129,6 +132,7 @@ Status ExchangeNode::fill_input_row_batch(RuntimeState* state) { Status ExchangeNode::get_next(RuntimeState* state, RowBatch* output_batch, bool* eos) { RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (reached_limit()) { diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp index 4501d423430bb5..f65dacc5f1ae72 100644 --- a/be/src/exec/exec_node.cpp +++ b/be/src/exec/exec_node.cpp @@ -203,12 +203,11 @@ Status ExecNode::prepare(RuntimeState* state) { std::bind(&RuntimeProfile::units_per_second, _rows_returned_counter, runtime_profile()->total_time_counter()), ""); - _mem_tracker = MemTracker::CreateTracker(_runtime_profile.get(), -1, - "ExecNode:" + _runtime_profile->name(), - state->instance_mem_tracker()); - _expr_mem_tracker = MemTracker::CreateTracker(-1, "ExecNode:Exprs:" + _runtime_profile->name(), - _mem_tracker); - _expr_mem_pool.reset(new MemPool(_expr_mem_tracker.get())); + _mem_tracker = MemTracker::create_tracker(-1, "ExecNode:" + _runtime_profile->name(), + state->instance_mem_tracker(), + MemTrackerLevel::VERBOSE, _runtime_profile.get()); + _expr_mem_tracker = MemTracker::create_tracker(-1, "ExecNode:Exprs:" + 
_runtime_profile->name(), + _mem_tracker); if (_vconjunct_ctx_ptr) { RETURN_IF_ERROR((*_vconjunct_ctx_ptr)->prepare(state, row_desc(), expr_mem_tracker())); @@ -270,10 +269,6 @@ Status ExecNode::close(RuntimeState* state) { if (_vconjunct_ctx_ptr) (*_vconjunct_ctx_ptr)->close(state); Expr::close(_conjunct_ctxs, state); - if (expr_mem_pool() != nullptr) { - _expr_mem_pool->free_all(); - } - if (_buffer_pool_client.is_registered()) { VLOG_FILE << _id << " returning reservation " << _resource_profile.min_reservation; state->initial_reservations()->Return(&_buffer_pool_client, diff --git a/be/src/exec/exec_node.h b/be/src/exec/exec_node.h index 7cad50018d848a..1644ba5165db3c 100644 --- a/be/src/exec/exec_node.h +++ b/be/src/exec/exec_node.h @@ -196,8 +196,6 @@ class ExecNode { std::shared_ptr expr_mem_tracker() const { return _expr_mem_tracker; } - MemPool* expr_mem_pool() const { return _expr_mem_pool.get(); } - // Extract node id from p->name(). static int get_node_id_from_profile(RuntimeProfile* p); @@ -306,14 +304,9 @@ class ExecNode { /// Account for peak memory used by this node std::shared_ptr _mem_tracker; - - /// MemTracker used by 'expr_mem_pool_'. + // MemTracker used by all Expr. std::shared_ptr _expr_mem_tracker; - /// MemPool for allocating data structures used by expression evaluators in this node. - /// Created in Prepare(). - std::unique_ptr _expr_mem_pool; - RuntimeProfile::Counter* _rows_returned_counter; RuntimeProfile::Counter* _rows_returned_rate; // Account for peak memory used by this node @@ -377,25 +370,6 @@ class ExecNode { bool _is_closed; }; -#define LIMIT_EXCEEDED(tracker, state, msg) \ - do { \ - stringstream str; \ - str << "Memory exceed limit. " << msg << " "; \ - str << "Backend: " << BackendOptions::get_localhost() << ", "; \ - str << "fragment: " << print_id(state->fragment_instance_id()) << " "; \ - str << "Used: " << tracker->consumption() << ", Limit: " << tracker->limit() << ". 
"; \ - str << "You can change the limit by session variable exec_mem_limit."; \ - return Status::MemoryLimitExceeded(str.str()); \ - } while (false) - -#define RETURN_IF_LIMIT_EXCEEDED(state, msg) \ - do { \ - /* if (UNLIKELY(MemTracker::limit_exceeded(*(state)->mem_trackers()))) { */ \ - MemTracker* tracker = state->instance_mem_tracker()->find_limit_exceeded_tracker(); \ - if (tracker != nullptr) { \ - LIMIT_EXCEEDED(tracker, state, msg); \ - } \ - } while (false) } // namespace doris #endif diff --git a/be/src/exec/hash_join_node.cpp b/be/src/exec/hash_join_node.cpp index f7e8bbcc649858..16c42166e5db62 100644 --- a/be/src/exec/hash_join_node.cpp +++ b/be/src/exec/hash_join_node.cpp @@ -30,6 +30,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_filter_mgr.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/defer_op.h" #include "util/runtime_profile.h" @@ -95,8 +96,9 @@ Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { Status HashJoinNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); - _build_pool.reset(new MemPool(mem_tracker().get())); + _build_pool.reset(new MemPool()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _push_down_timer = ADD_TIMER(runtime_profile(), "PushDownTime"); _push_compute_timer = ADD_TIMER(runtime_profile(), "PushDownComputeTime"); @@ -146,8 +148,7 @@ Status HashJoinNode::prepare(RuntimeState* state) { _hash_tbl.reset(new HashTable(_build_expr_ctxs, _probe_expr_ctxs, _build_tuple_size, stores_nulls, _is_null_safe_eq_join, id(), mem_tracker(), 1024)); - _probe_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + _probe_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); return Status::OK(); } @@ -156,6 +157,7 @@ Status HashJoinNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); // Must reset _probe_batch in close() to release resources @@ -176,6 +178,8 @@ Status HashJoinNode::close(RuntimeState* state) { } void HashJoinNode::build_side_thread(RuntimeState* state, std::promise* status) { + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), mem_tracker()); status->set_value(construct_hash_table(state)); } @@ -184,7 +188,8 @@ Status HashJoinNode::construct_hash_table(RuntimeState* state) { // The hash join node needs to keep in memory all build tuples, including the tuple // row ptrs. The row ptrs are copied into the hash table's internal structure so they // don't need to be stored in the _build_pool. 
- RowBatch build_batch(child(1)->row_desc(), state->batch_size(), mem_tracker().get()); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB("Hash join, while constructing the hash table."); + RowBatch build_batch(child(1)->row_desc(), state->batch_size()); RETURN_IF_ERROR(child(1)->open(state)); SCOPED_TIMER(_build_timer); @@ -214,6 +219,7 @@ Status HashJoinNode::construct_hash_table(RuntimeState* state) { } Status HashJoinNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -231,7 +237,8 @@ Status HashJoinNode::open(RuntimeState* state) { // main thread std::promise thread_status; add_runtime_exec_option("Hash Table Built Asynchronously"); - std::thread(bind(&HashJoinNode::build_side_thread, this, state, &thread_status)).detach(); + std::thread(bind(&HashJoinNode::build_side_thread, this, state, &thread_status)) + .detach(); if (!_runtime_filter_descs.empty()) { RuntimeFilterSlots runtime_filter_slots(_probe_expr_ctxs, _build_expr_ctxs, @@ -301,7 +308,7 @@ Status HashJoinNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eo // In most cases, no additional memory overhead will be applied for at this stage, // but if the expression calculation in this node needs to apply for additional memory, // it may cause the memory to exceed the limit. - RETURN_IF_LIMIT_EXCEEDED(state, "Hash join, while execute get_next."); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "Hash join, while execute get_next."); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (reached_limit()) { @@ -770,11 +777,9 @@ Status HashJoinNode::process_build_batch(RuntimeState* state, RowBatch* build_ba _build_pool.get(), false); } } - RETURN_IF_LIMIT_EXCEEDED(state, "Hash join, while constructing the hash table."); } else { // take ownership of tuple data of build_batch _build_pool->acquire_data(build_batch->tuple_data_pool(), false); - RETURN_IF_LIMIT_EXCEEDED(state, "Hash join, while constructing the hash table."); for (int i = 0; i < build_batch->num_rows(); ++i) { _hash_tbl->insert(build_batch->get_row(i)); diff --git a/be/src/exec/hash_join_node_ir.cpp b/be/src/exec/hash_join_node_ir.cpp new file mode 100644 index 00000000000000..6dc5ab1dc01c35 --- /dev/null +++ b/be/src/exec/hash_join_node_ir.cpp @@ -0,0 +1,170 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "common/utils.h" +#include "exec/hash_join_node.h" +#include "exec/hash_table.hpp" +#include "exprs/expr_context.h" +#include "runtime/row_batch.h" +#include "runtime/runtime_state.h" +#include "runtime/tuple_row.h" + +namespace doris { + +// Functions in this file are cross compiled to IR with clang. + +// Wrapper around ExecNode's eval conjuncts with a different function name. +// This lets us distinguish between the join conjuncts vs. non-join conjuncts +// for codegen. +// Note: don't declare this static. LLVM will pick the fastcc calling convention and +// we will not be able to replace the functions with codegen'd versions. +// TODO: explicitly set the calling convention? +// TODO: investigate using fastcc for all codegen internal functions? +bool IR_NO_INLINE eval_other_join_conjuncts(ExprContext* const* ctxs, int num_ctxs, TupleRow* row) { + return ExecNode::eval_conjuncts(ctxs, num_ctxs, row); +} + +// CreateOutputRow, EvalOtherJoinConjuncts, and EvalConjuncts are replaced by +// codegen. +int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch, + int max_added_rows) { + // This path does not handle full outer or right outer joins + DCHECK(!_match_all_build); + + int row_idx = out_batch->add_rows(max_added_rows); + DCHECK(row_idx != RowBatch::INVALID_ROW_INDEX); + uint8_t* out_row_mem = reinterpret_cast(out_batch->get_row(row_idx)); + TupleRow* out_row = reinterpret_cast(out_row_mem); + + int rows_returned = 0; + int probe_rows = probe_batch->num_rows(); + + ExprContext* const* other_conjunct_ctxs = &_other_join_conjunct_ctxs[0]; + int num_other_conjunct_ctxs = _other_join_conjunct_ctxs.size(); + + ExprContext* const* conjunct_ctxs = &_conjunct_ctxs[0]; + int num_conjunct_ctxs = _conjunct_ctxs.size(); + + while (true) { + // Create output row for each matching build row + while (_hash_tbl_iterator.has_next()) { + TupleRow* matched_build_row = _hash_tbl_iterator.get_row(); + _hash_tbl_iterator.next(); + create_output_row(out_row, _current_probe_row, matched_build_row); + + if (!eval_other_join_conjuncts(other_conjunct_ctxs, num_other_conjunct_ctxs, out_row)) { + continue; + } + + _matched_probe = true; + + // left_anti_join: equal match won't return + if (_join_op == TJoinOp::LEFT_ANTI_JOIN) { + _hash_tbl_iterator = _hash_tbl->end(); + break; + } + + if (eval_conjuncts(conjunct_ctxs, num_conjunct_ctxs, out_row)) { + ++rows_returned; + + // Filled up out batch or hit limit + if (UNLIKELY(rows_returned == max_added_rows)) { + goto end; + } + + // Advance to next out row + out_row_mem += out_batch->row_byte_size(); + out_row = reinterpret_cast(out_row_mem); + } + + // Handle left semi-join + if (_match_one_build) { + _hash_tbl_iterator = _hash_tbl->end(); + break; + } + } + + // Handle left outer-join and left semi-join + if ((!_matched_probe && _match_all_probe) || + ((!_matched_probe && _join_op == TJoinOp::LEFT_ANTI_JOIN))) { + create_output_row(out_row, _current_probe_row, nullptr); + _matched_probe = true; + + if (ExecNode::eval_conjuncts(conjunct_ctxs, num_conjunct_ctxs, out_row)) { + ++rows_returned; + + if (UNLIKELY(rows_returned == max_added_rows)) { + goto end; + } + + // Advance to next out row + out_row_mem += out_batch->row_byte_size(); + out_row = reinterpret_cast(out_row_mem); + } + } + + if (!_hash_tbl_iterator.has_next()) { + // Advance to the next probe row + if (UNLIKELY(_probe_batch_pos == probe_rows)) { + goto end; + } + if (++_probe_counter % RELEASE_CONTEXT_COUNTER == 0) { + 
ExprContext::free_local_allocations(_probe_expr_ctxs); + ExprContext::free_local_allocations(_build_expr_ctxs); + } + _current_probe_row = probe_batch->get_row(_probe_batch_pos++); + _hash_tbl_iterator = _hash_tbl->find(_current_probe_row); + _matched_probe = false; + } + } + +end: + + if (_match_one_build && _matched_probe) { + _hash_tbl_iterator = _hash_tbl->end(); + } + + out_batch->commit_rows(rows_returned); + return rows_returned; +} + +// When the build table has many duplicated rows, hash collisions become severe, +// so in some cases we don't need to store the duplicated values in the hash table and can build a unique one instead +Status HashJoinNode::process_build_batch(RuntimeState* state, RowBatch* build_batch) { + // insert build row into our hash table + if (_build_unique) { + for (int i = 0; i < build_batch->num_rows(); ++i) { + // _hash_tbl->insert_unique(build_batch->get_row(i)); + TupleRow* tuple_row = nullptr; + if (_hash_tbl->emplace_key(build_batch->get_row(i), &tuple_row)) { + build_batch->get_row(i)->deep_copy(tuple_row, + child(1)->row_desc().tuple_descriptors(), + _build_pool.get(), false); + } + } + } else { + // take ownership of tuple data of build_batch + _build_pool->acquire_data(build_batch->tuple_data_pool(), false); + + for (int i = 0; i < build_batch->num_rows(); ++i) { + _hash_tbl->insert(build_batch->get_row(i)); + } + } + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/exec/hash_table.cpp b/be/src/exec/hash_table.cpp index 36c3d7b76eaa9c..022bbc301af20b 100644 --- a/be/src/exec/hash_table.cpp +++ b/be/src/exec/hash_table.cpp @@ -44,16 +44,16 @@ HashTable::HashTable(const std::vector& build_expr_ctxs, _current_capacity(num_buckets), _current_used(0), _total_capacity(num_buckets), - _exceeded_limit(false), - _mem_tracker(mem_tracker) { - DCHECK(_mem_tracker); + _exceeded_limit(false) { DCHECK_EQ(_build_expr_ctxs.size(), _probe_expr_ctxs.size()); DCHECK_EQ((num_buckets & (num_buckets - 1)), 0) << "num_buckets must be a power of 2"; + _mem_tracker = MemTracker::create_virtual_tracker(-1, mem_tracker->label() + "HashTable", + mem_tracker); _buckets.resize(num_buckets); _num_buckets = num_buckets; _num_buckets_till_resize = MAX_BUCKET_OCCUPANCY_FRACTION * _num_buckets; - _mem_tracker->Consume(_buckets.capacity() * sizeof(Bucket)); + _mem_tracker->consume(_buckets.capacity() * sizeof(Bucket)); // Compute the layout and buffer size to store the evaluated expr results _results_buffer_size = Expr::compute_results_layout( @@ -70,7 +70,7 @@ HashTable::HashTable(const std::vector& build_expr_ctxs, _alloc_list.push_back(_current_nodes); _end_list.push_back(_current_nodes + _current_capacity * _node_byte_size); - _mem_tracker->Consume(_current_capacity * _node_byte_size); + _mem_tracker->consume(_current_capacity * _node_byte_size); if (_mem_tracker->limit_exceeded()) { mem_limit_exceeded(_current_capacity * _node_byte_size); } @@ -85,8 +85,8 @@ void HashTable::close() { for (auto ptr : _alloc_list) { free(ptr); } - _mem_tracker->Release(_total_capacity * _node_byte_size); - _mem_tracker->Release(_buckets.size() * sizeof(Bucket)); + _mem_tracker->release(_total_capacity * _node_byte_size); + _mem_tracker->release(_buckets.size() * sizeof(Bucket)); } bool HashTable::eval_row(TupleRow* row, const std::vector& ctxs) { @@ -180,7 +180,7 @@ Status HashTable::resize_buckets(int64_t num_buckets) { int64_t old_num_buckets = _num_buckets; int64_t delta_bytes = (num_buckets - old_num_buckets) * sizeof(Bucket); - Status st = _mem_tracker->TryConsume(delta_bytes); + 
Status st = _mem_tracker->try_consume(delta_bytes); if (!st) { LOG_EVERY_N(WARNING, 100) << "resize bucket failed: " << st.to_string(); mem_limit_exceeded(delta_bytes); @@ -244,7 +244,7 @@ void HashTable::grow_node_array() { _alloc_list.push_back(_current_nodes); _end_list.push_back(_current_nodes + alloc_size); - _mem_tracker->Consume(alloc_size); + _mem_tracker->consume(alloc_size); if (_mem_tracker->limit_exceeded()) { mem_limit_exceeded(alloc_size); } diff --git a/be/src/exec/intersect_node.cpp b/be/src/exec/intersect_node.cpp index 60481cce861634..98660740a4d3f2 100644 --- a/be/src/exec/intersect_node.cpp +++ b/be/src/exec/intersect_node.cpp @@ -21,6 +21,7 @@ #include "exprs/expr.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" namespace doris { IntersectNode::IntersectNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) @@ -43,6 +44,7 @@ Status IntersectNode::init(const TPlanNode& tnode, RuntimeState* state) { // 2 probe with child(1), then filter the hash table and find the matched item, use them to rebuild a hash table // repeat [2] this for all the rest child Status IntersectNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "Intersect , while probing the hash table."); RETURN_IF_ERROR(SetOperationNode::open(state)); // if a table is empty, the result must be empty if (_hash_tbl->size() == 0) { @@ -57,14 +59,13 @@ Status IntersectNode::open(RuntimeState* state) { _valid_element_in_hash_tbl = 0; // probe _probe_batch.reset( - new RowBatch(child(i)->row_desc(), state->batch_size(), mem_tracker().get())); + new RowBatch(child(i)->row_desc(), state->batch_size())); ScopedTimer probe_timer(_probe_timer); RETURN_IF_ERROR(child(i)->open(state)); eos = false; while (!eos) { RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(child(i)->get_next(state, _probe_batch.get(), &eos)); - RETURN_IF_LIMIT_EXCEEDED(state, " Intersect , while probing the hash table."); for (int j = 0; j < _probe_batch->num_rows(); ++j) { VLOG_ROW << "probe row: " << get_row_output_string(_probe_batch->get_row(j), child(i)->row_desc()); @@ -87,6 +88,7 @@ Status IntersectNode::open(RuntimeState* state) { } Status IntersectNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eos) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/json_scanner.cpp b/be/src/exec/json_scanner.cpp index eaff43f69a7ed2..3d611e668187d9 100644 --- a/be/src/exec/json_scanner.cpp +++ b/be/src/exec/json_scanner.cpp @@ -30,7 +30,6 @@ #include "exprs/json_functions.h" #include "gutil/strings/split.h" #include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" #include "runtime/runtime_state.h" namespace doris { diff --git a/be/src/exec/json_scanner.h b/be/src/exec/json_scanner.h index 1a489a5ed890d3..91528c8351b927 100644 --- a/be/src/exec/json_scanner.h +++ b/be/src/exec/json_scanner.h @@ -47,7 +47,6 @@ class Tuple; class SlotDescriptor; class RuntimeState; class TupleDescriptor; -class MemTracker; class JsonReader; class LineReader; class FileReader; diff --git a/be/src/exec/merge_join_node.cpp b/be/src/exec/merge_join_node.cpp index d83e872507e40a..bf6e78ca16677f 100644 --- a/be/src/exec/merge_join_node.cpp +++ b/be/src/exec/merge_join_node.cpp @@ -25,6 +25,7 @@ #include "gen_cpp/PlanNodes_types.h" #include "runtime/row_batch.h" #include 
"runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/debug_util.h" #include "util/runtime_profile.h" @@ -71,6 +72,7 @@ Status MergeJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { Status MergeJoinNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // build and probe exprs are evaluated in the context of the rows produced by our // right and left children, respectively @@ -129,10 +131,8 @@ Status MergeJoinNode::prepare(RuntimeState* state) { _right_tuple_idx.push_back(_row_descriptor.get_tuple_idx(right_tuple_desc->id())); } - _left_child_ctx.reset( - new ChildReaderContext(row_desc(), state->batch_size(), state->instance_mem_tracker())); - _right_child_ctx.reset( - new ChildReaderContext(row_desc(), state->batch_size(), state->instance_mem_tracker())); + _left_child_ctx.reset(new ChildReaderContext(row_desc(), state->batch_size())); + _right_child_ctx.reset(new ChildReaderContext(row_desc(), state->batch_size())); return Status::OK(); } @@ -141,6 +141,7 @@ Status MergeJoinNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); Expr::close(_left_expr_ctxs, state); Expr::close(_right_expr_ctxs, state); @@ -149,6 +150,7 @@ Status MergeJoinNode::close(RuntimeState* state) { } Status MergeJoinNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(ExecNode::open(state)); @@ -170,6 +172,7 @@ Status MergeJoinNode::open(RuntimeState* state) { } Status MergeJoinNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eos) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -295,14 +298,12 @@ Status MergeJoinNode::get_input_row(RuntimeState* state, int child_idx) { } if (child_idx == 0) { - _left_child_ctx.reset(new ChildReaderContext(child(child_idx)->row_desc(), - state->batch_size(), - state->instance_mem_tracker())); + _left_child_ctx.reset( + new ChildReaderContext(child(child_idx)->row_desc(), state->batch_size())); ctx = _left_child_ctx.get(); } else { - _right_child_ctx.reset(new ChildReaderContext(child(child_idx)->row_desc(), - state->batch_size(), - state->instance_mem_tracker())); + _right_child_ctx.reset( + new ChildReaderContext(child(child_idx)->row_desc(), state->batch_size())); ctx = _right_child_ctx.get(); } diff --git a/be/src/exec/merge_join_node.h b/be/src/exec/merge_join_node.h index d8b294ea186f1c..ef02727be576fd 100644 --- a/be/src/exec/merge_join_node.h +++ b/be/src/exec/merge_join_node.h @@ -65,9 +65,8 @@ class MergeJoinNode : public ExecNode { int row_idx; bool is_eos; TupleRow* current_row; - ChildReaderContext(const RowDescriptor& desc, int batch_size, - const std::shared_ptr& mem_tracker) - : batch(desc, batch_size, mem_tracker.get()), + ChildReaderContext(const RowDescriptor& desc, int batch_size) + : batch(desc, batch_size), row_idx(0), is_eos(false), current_row(nullptr) {} diff --git a/be/src/exec/merge_node.cpp b/be/src/exec/merge_node.cpp index 513284e4ca6f78..b2a12b33862604 100644 --- a/be/src/exec/merge_node.cpp +++ b/be/src/exec/merge_node.cpp @@ -23,6 +23,7 @@ #include 
"runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" using std::vector; @@ -60,6 +61,7 @@ Status MergeNode::init(const TPlanNode& tnode, RuntimeState* state) { Status MergeNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); DCHECK(_tuple_desc != nullptr); @@ -90,6 +92,7 @@ Status MergeNode::prepare(RuntimeState* state) { } Status MergeNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); // Prepare const expr lists. for (int i = 0; i < _const_result_expr_ctx_lists.size(); ++i) { @@ -105,6 +108,7 @@ Status MergeNode::open(RuntimeState* state) { } Status MergeNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -136,8 +140,8 @@ Status MergeNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) // Row batch was either never set or we're moving on to a different child. if (_child_row_batch.get() == nullptr) { RETURN_IF_CANCELLED(state); - _child_row_batch.reset(new RowBatch(child(_child_idx)->row_desc(), state->batch_size(), - mem_tracker().get())); + _child_row_batch.reset( + new RowBatch(child(_child_idx)->row_desc(), state->batch_size())); // Open child and fetch the first row batch. RETURN_IF_ERROR(child(_child_idx)->open(state)); RETURN_IF_ERROR( @@ -185,6 +189,7 @@ Status MergeNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // don't call ExecNode::close(), it always closes all children _child_row_batch.reset(nullptr); for (int i = 0; i < _const_result_expr_ctx_lists.size(); ++i) { diff --git a/be/src/exec/mysql_scan_node.cpp b/be/src/exec/mysql_scan_node.cpp index 634f47c7842613..349fd3750da901 100644 --- a/be/src/exec/mysql_scan_node.cpp +++ b/be/src/exec/mysql_scan_node.cpp @@ -23,6 +23,7 @@ #include "gen_cpp/PlanNodes_types.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/string_value.h" #include "runtime/tuple_row.h" #include "util/runtime_profile.h" @@ -53,6 +54,7 @@ Status MysqlScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // get tuple desc _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -81,7 +83,7 @@ Status MysqlScanNode::prepare(RuntimeState* state) { return Status::InternalError("new a mysql scanner failed."); } - _tuple_pool.reset(new (std::nothrow) MemPool(mem_tracker().get())); + _tuple_pool.reset(new (std::nothrow) MemPool("MysqlScanNode")); if (_tuple_pool.get() == nullptr) { return Status::InternalError("new a mem pool failed."); @@ -99,6 +101,7 @@ Status MysqlScanNode::prepare(RuntimeState* state) { } Status MysqlScanNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); VLOG_CRITICAL << "MysqlScanNode::Open"; @@ -146,6 +149,7 @@ Status MysqlScanNode::write_text_slot(char* value, int value_length, SlotDescrip } Status MysqlScanNode::get_next(RuntimeState* 
state, RowBatch* row_batch, bool* eos) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "MysqlScanNode::GetNext"; if (nullptr == state || nullptr == row_batch || nullptr == eos) { @@ -241,6 +245,7 @@ Status MysqlScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/odbc_scan_node.cpp b/be/src/exec/odbc_scan_node.cpp index 958e22ef54a166..054dc825f213b6 100644 --- a/be/src/exec/odbc_scan_node.cpp +++ b/be/src/exec/odbc_scan_node.cpp @@ -24,6 +24,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/string_value.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "util/runtime_profile.h" @@ -55,6 +56,7 @@ Status OdbcScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // get tuple desc _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); @@ -74,7 +76,7 @@ Status OdbcScanNode::prepare(RuntimeState* state) { return Status::InternalError("new a odbc scanner failed."); } - _tuple_pool.reset(new (std::nothrow) MemPool(mem_tracker().get())); + _tuple_pool.reset(new (std::nothrow) MemPool("OdbcScanNode")); if (_tuple_pool.get() == nullptr) { return Status::InternalError("new a mem pool failed."); @@ -92,6 +94,7 @@ Status OdbcScanNode::prepare(RuntimeState* state) { } Status OdbcScanNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(ExecNode::open(state)); VLOG_CRITICAL << _scan_node_type << "::Open"; @@ -128,6 +131,7 @@ Status OdbcScanNode::write_text_slot(char* value, int value_length, SlotDescript Status OdbcScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { VLOG_CRITICAL << _scan_node_type << "::GetNext"; + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (nullptr == state || nullptr == row_batch || nullptr == eos) { return Status::InternalError("input is nullptr pointer"); @@ -232,6 +236,7 @@ Status OdbcScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index 4b1f775bcdb72c..fa29b048bb346a 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -34,6 +34,7 @@ #include "runtime/runtime_filter_mgr.h" #include "runtime/runtime_state.h" #include "runtime/string_value.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "util/priority_thread_pool.hpp" #include "util/runtime_profile.h" @@ -55,7 +56,6 @@ OlapScanNode::OlapScanNode(ObjectPool* pool, const TPlanNode& tnode, const Descr _transfer_done(false), _status(Status::OK()), _resource_info(nullptr), - _buffered_bytes(0), _eval_conjuncts_fn(nullptr), _runtime_filter_descs(tnode.runtime_filters) {} @@ -171,6 +171,7 @@ void OlapScanNode::_init_counter(RuntimeState* state) { Status OlapScanNode::prepare(RuntimeState* state) { init_scan_profile(); RETURN_IF_ERROR(ScanNode::prepare(state)); + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // create scanner profile // 
create timer _tablet_counter = ADD_COUNTER(runtime_profile(), "TabletCount ", TUnit::UNIT); @@ -179,6 +180,9 @@ Status OlapScanNode::prepare(RuntimeState* state) { _init_counter(state); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); + scanner_mem_tracker = MemTracker::create_virtual_tracker(state->instance_mem_tracker()->limit(), + "Scanners", mem_tracker()); + if (_tuple_desc == nullptr) { // TODO: make sure we print all available diagnostic output to our error log return Status::InternalError("Failed to get tuple descriptor."); @@ -212,6 +216,7 @@ Status OlapScanNode::prepare(RuntimeState* state) { } Status OlapScanNode::open(RuntimeState* state) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); VLOG_CRITICAL << "OlapScanNode::Open"; SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); @@ -255,6 +260,7 @@ Status OlapScanNode::open(RuntimeState* state) { } Status OlapScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -346,8 +352,6 @@ Status OlapScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eo << Tuple::to_string(row->get_tuple(0), *_tuple_desc); } } - __sync_fetch_and_sub(&_buffered_bytes, - row_batch->tuple_data_pool()->total_reserved_bytes()); delete materialized_batch; return Status::OK(); @@ -371,6 +375,7 @@ Status OlapScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); // change done status @@ -794,8 +799,9 @@ Status OlapScanNode::start_scan_thread(RuntimeState* state) { ++j, ++i) { scanner_ranges.push_back((*ranges)[i].get()); } - OlapScanner* scanner = new OlapScanner(state, this, _olap_scan_node.is_preaggregation, - _need_agg_finalize, *scan_range); + OlapScanner* scanner = + new OlapScanner(state, this, _olap_scan_node.is_preaggregation, + _need_agg_finalize, *scan_range, scanner_mem_tracker); // add scanner to pool before doing prepare. // so that scanner can be automatically deconstructed if prepare failed. 
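// --- Illustrative sketch (not part of the patch) ---------------------------
// The "Scanners" virtual tracker created in prepare() above is, in essence, a
// parent-chained counter: each OlapScanner consumes against its own child
// tracker, and the bytes roll up into the shared scanner tracker and the
// instance tracker above it. A minimal stand-in for that hierarchy, under
// simplified assumptions (the real MemTracker adds levels, metrics and limit
// enforcement; SimpleTracker here is purely illustrative):
#include <atomic>
#include <cstdint>
#include <memory>
#include <string>

class SimpleTracker {
public:
    SimpleTracker(int64_t limit, std::string label,
                  std::shared_ptr<SimpleTracker> parent = nullptr)
            : _limit(limit), _label(std::move(label)), _parent(std::move(parent)) {}

    // consume()/release() update this tracker and every ancestor, which is why
    // a single tracker can expose the whole scanner budget to transfer_thread().
    void consume(int64_t bytes) {
        for (SimpleTracker* t = this; t != nullptr; t = t->_parent.get()) {
            t->_consumption.fetch_add(bytes, std::memory_order_relaxed);
        }
    }
    void release(int64_t bytes) { consume(-bytes); }

    int64_t consumption() const { return _consumption.load(std::memory_order_relaxed); }
    int64_t limit() const { return _limit; }

private:
    const int64_t _limit; // -1 follows the "no limit" convention used in this patch
    const std::string _label;
    std::shared_ptr<SimpleTracker> _parent;
    std::atomic<int64_t> _consumption{0};
};
// With this shape, the transfer_thread() changes below can throttle scanner
// threads once consumption() crosses 60% of limit() ((mem_limit * 6) / 10),
// instead of polling the hand-rolled _buffered_bytes counter the patch deletes.
// ---------------------------------------------------------------------------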
_scanner_pool.add(scanner); @@ -1331,6 +1337,8 @@ Status OlapScanNode::normalize_bloom_filter_predicate(SlotDescriptor* slot) { void OlapScanNode::transfer_thread(RuntimeState* state) { // scanner open pushdown to scanThread + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), mem_tracker()); Status status = Status::OK(); for (auto scanner : _olap_scanners) { status = Expr::clone_if_not_exists(_conjunct_ctxs, state, scanner->conjunct_ctxs()); @@ -1358,13 +1366,8 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { _nice = 18 + std::max(0, 2 - (int)_olap_scanners.size() / 5); std::list olap_scanners; - int64_t mem_limit = 512 * 1024 * 1024; - // TODO(zc): use memory limit - int64_t mem_consume = __sync_fetch_and_add(&_buffered_bytes, 0); - if (state->fragment_mem_tracker() != nullptr) { - mem_limit = state->fragment_mem_tracker()->limit(); - mem_consume = state->fragment_mem_tracker()->consumption(); - } + int64_t mem_limit = scanner_mem_tracker->limit(); + int64_t mem_consume = scanner_mem_tracker->consumption(); int max_thread = _max_materialized_row_batches; if (config::doris_scanner_row_num > state->batch_size()) { max_thread /= config::doris_scanner_row_num / state->batch_size(); @@ -1383,13 +1386,9 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { { std::unique_lock l(_scan_batches_lock); assigned_thread_num = _running_thread; - // int64_t buf_bytes = __sync_fetch_and_add(&_buffered_bytes, 0); // How many thread can apply to this query size_t thread_slot_num = 0; - mem_consume = __sync_fetch_and_add(&_buffered_bytes, 0); - if (state->fragment_mem_tracker() != nullptr) { - mem_consume = state->fragment_mem_tracker()->consumption(); - } + mem_consume = scanner_mem_tracker->consumption(); if (mem_consume < (mem_limit * 6) / 10) { thread_slot_num = max_thread - assigned_thread_num; } else { @@ -1501,6 +1500,9 @@ void OlapScanNode::transfer_thread(RuntimeState* state) { } void OlapScanNode::scanner_thread(OlapScanner* scanner) { + SCOPED_ATTACH_TASK_THREAD_4ARG(_runtime_state->query_type(), + print_id(_runtime_state->query_id()), + _runtime_state->fragment_instance_id(), mem_tracker()); if (UNLIKELY(_transfer_done)) { _scanner_done = true; std::unique_lock l(_scan_batches_lock); @@ -1580,8 +1582,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { << ", fragment id=" << print_id(_runtime_state->fragment_instance_id()); break; } - RowBatch* row_batch = new RowBatch(this->row_desc(), state->batch_size(), - _runtime_state->fragment_mem_tracker().get()); + RowBatch* row_batch = new RowBatch(this->row_desc(), state->batch_size()); row_batch->set_scanner_id(scanner->id()); status = scanner->get_batch(_runtime_state, row_batch, &eos); if (!status.ok()) { @@ -1596,8 +1597,6 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { row_batch = nullptr; } else { row_batchs.push_back(row_batch); - __sync_fetch_and_add(&_buffered_bytes, - row_batch->tuple_data_pool()->total_reserved_bytes()); } raw_rows_read = scanner->raw_rows_read(); } @@ -1678,7 +1677,6 @@ Status OlapScanNode::add_one_batch(RowBatch* row_batch) { return Status::OK(); } - vectorized::VExpr* OlapScanNode::_dfs_peel_conjunct(vectorized::VExpr* expr, int& leaf_index) { static constexpr auto is_leaf = [](vectorized::VExpr* expr) { return !expr->is_and_expr(); }; diff --git a/be/src/exec/olap_scan_node.h b/be/src/exec/olap_scan_node.h index 82e98d5c0bac01..83150c747c20f4 100644 --- a/be/src/exec/olap_scan_node.h +++ 
b/be/src/exec/olap_scan_node.h @@ -248,7 +248,7 @@ class OlapScanNode : public ScanNode { TResourceInfo* _resource_info; - int64_t _buffered_bytes; + std::shared_ptr<MemTracker> scanner_mem_tracker; EvalConjunctsFn _eval_conjuncts_fn; bool _need_agg_finalize = true; diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp index d7dc83967b5abb..37cd62e97f4aa5 100644 --- a/be/src/exec/olap_scanner.cpp +++ b/be/src/exec/olap_scanner.cpp @@ -30,6 +30,7 @@ #include "runtime/descriptors.h" #include "runtime/mem_pool.h" #include "runtime/mem_tracker.h" +#include "runtime/thread_context.h" #include "runtime/runtime_state.h" #include "service/backend_options.h" #include "util/doris_metrics.h" @@ -39,7 +40,8 @@ namespace doris { OlapScanner::OlapScanner(RuntimeState* runtime_state, OlapScanNode* parent, bool aggregation, - bool need_agg_finalize, const TPaloScanRange& scan_range) + bool need_agg_finalize, const TPaloScanRange& scan_range, + std::shared_ptr<MemTracker> tracker) : _runtime_state(runtime_state), _parent(parent), _tuple_desc(parent->_tuple_desc), @@ -48,16 +50,15 @@ OlapScanner::OlapScanner(RuntimeState* runtime_state, OlapScanNode* parent, bool _aggregation(aggregation), _need_agg_finalize(need_agg_finalize), _version(-1), - _mem_tracker(MemTracker::CreateTracker( - runtime_state->fragment_mem_tracker()->limit(), "OlapScanner", - runtime_state->fragment_mem_tracker(), true, true, MemTrackerLevel::VERBOSE)) { -} + _mem_tracker(MemTracker::create_tracker(tracker->limit(), + tracker->label() + ":OlapScanner", tracker)) {} Status OlapScanner::prepare( const TPaloScanRange& scan_range, const std::vector<OlapScanRange*>& key_ranges, const std::vector<TCondition>& filters, const std::vector<std::pair<std::string, std::shared_ptr<IBloomFilterFuncBase>>>& bloom_filters) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); set_tablet_reader(); // set limit to reduce end of rowset and segment mem use _tablet_reader->set_batch_size(_parent->limit() == -1 ? _parent->_runtime_state->batch_size() : std::min( @@ -93,7 +94,7 @@ Status OlapScanner::prepare( // the rowsets maybe compacted when the last olap scanner starts Version rd_version(0, _version); OLAPStatus acquire_reader_st = - _tablet->capture_rs_readers(rd_version, &_tablet_reader_params.rs_readers, _mem_tracker); + _tablet->capture_rs_readers(rd_version, &_tablet_reader_params.rs_readers); if (acquire_reader_st != OLAP_SUCCESS) { LOG(WARNING) << "fail to init reader.res=" << acquire_reader_st; std::stringstream ss; @@ -114,6 +115,7 @@ Status OlapScanner::prepare( } Status OlapScanner::open() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_parent->_reader_init_timer); if (_conjunct_ctxs.size() > _parent->_direct_conjunct_size) { @@ -257,13 +259,14 @@ Status OlapScanner::_init_return_columns() { } Status OlapScanner::get_batch(RuntimeState* state, RowBatch* batch, bool* eof) { + // SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // 2. 
Allocate Row's Tuple buf uint8_t* tuple_buf = batch->tuple_data_pool()->allocate(state->batch_size() * _tuple_desc->byte_size()); bzero(tuple_buf, state->batch_size() * _tuple_desc->byte_size()); Tuple* tuple = reinterpret_cast<Tuple*>(tuple_buf); - std::unique_ptr<MemPool> mem_pool(new MemPool(_mem_tracker.get())); + std::unique_ptr<MemPool> mem_pool(new MemPool()); int64_t raw_rows_threshold = raw_rows_read() + config::doris_scanner_row_num; { SCOPED_TIMER(_parent->_scan_timer); @@ -275,7 +278,7 @@ Status OlapScanner::get_batch(RuntimeState* state, RowBatch* batch, bool* eof) { } // Read one row from reader auto res = _tablet_reader->next_row_with_aggregation(&_read_row_cursor, mem_pool.get(), - batch->agg_object_pool(), eof); + batch->agg_object_pool(), eof); if (res != OLAP_SUCCESS) { std::stringstream ss; ss << "Internal Error: read storage fail. res=" << res @@ -586,6 +589,7 @@ Status OlapScanner::close(RuntimeState* state) { if (_is_closed) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // olap scan node will call scanner.close() when finished // will release resources here // if not clear rowset readers in read_params here diff --git a/be/src/exec/olap_scanner.h b/be/src/exec/olap_scanner.h index 0c684d9851b378..a7938d323f587e 100644 --- a/be/src/exec/olap_scanner.h +++ b/be/src/exec/olap_scanner.h @@ -47,7 +47,8 @@ class OlapScanNode; class OlapScanner { public: OlapScanner(RuntimeState* runtime_state, OlapScanNode* parent, bool aggregation, - bool need_agg_finalize, const TPaloScanRange& scan_range); + bool need_agg_finalize, const TPaloScanRange& scan_range, + std::shared_ptr<MemTracker> tracker); virtual ~OlapScanner() = default; diff --git a/be/src/exec/orc_scanner.cpp b/be/src/exec/orc_scanner.cpp index 25031c3016ba38..2ea9f934f8f4fa 100644 --- a/be/src/exec/orc_scanner.cpp +++ b/be/src/exec/orc_scanner.cpp @@ -24,13 +24,12 @@ #include "exprs/expr.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" -#include "runtime/mem_tracker.h" #include "runtime/raw_value.h" #include "runtime/runtime_state.h" #include "runtime/tuple.h" #if defined(__x86_64__) - #include "exec/hdfs_file_reader.h" +#include "exec/hdfs_file_reader.h" #endif // orc include file didn't expose orc::TimezoneError diff --git a/be/src/exec/partitioned_aggregation_node.cc b/be/src/exec/partitioned_aggregation_node.cc index 98651998b6123b..4b006b1fd78436 100644 --- a/be/src/exec/partitioned_aggregation_node.cc +++ b/be/src/exec/partitioned_aggregation_node.cc @@ -41,6 +41,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/string_value.h" +#include "runtime/thread_context.h" #include "runtime/tuple.h" #include "runtime/tuple_row.h" #include "udf/udf_internal.h" @@ -152,8 +153,7 @@ Status PartitionedAggregationNode::init(const TPlanNode& tnode, RuntimeState* st DCHECK_EQ(intermediate_tuple_desc_->slots().size(), output_tuple_desc_->slots().size()); const RowDescriptor& row_desc = child(0)->row_desc(); - RETURN_IF_ERROR(Expr::create(tnode.agg_node.grouping_exprs, row_desc, state, &grouping_exprs_, - mem_tracker())); + RETURN_IF_ERROR(Expr::create(tnode.agg_node.grouping_exprs, row_desc, state, &grouping_exprs_)); // Construct build exprs from intermediate_row_desc_ for (int i = 0; i < grouping_exprs_.size(); ++i) { SlotDescriptor* desc = intermediate_tuple_desc_->slots()[i]; @@ -185,10 +185,11 @@ Status PartitionedAggregationNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + 
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); state_ = state; - mem_pool_.reset(new MemPool(mem_tracker().get())); - agg_fn_pool_.reset(new MemPool(expr_mem_tracker().get())); + mem_pool_.reset(new MemPool()); + agg_fn_pool_.reset(new MemPool()); ht_resize_timer_ = ADD_TIMER(runtime_profile(), "HTResizeTime"); get_results_timer_ = ADD_TIMER(runtime_profile(), "GetResultsTime"); @@ -231,20 +232,21 @@ Status PartitionedAggregationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(NewAggFnEvaluator::Create(agg_fns_, state, _pool, agg_fn_pool_.get(), &agg_fn_evals_, expr_mem_tracker(), row_desc)); - expr_results_pool_.reset(new MemPool(expr_mem_tracker().get())); + expr_results_pool_.reset(new MemPool(expr_mem_tracker())); if (!grouping_exprs_.empty()) { RowDescriptor build_row_desc(intermediate_tuple_desc_, false); RETURN_IF_ERROR(PartitionedHashTableCtx::Create( _pool, state, build_exprs_, grouping_exprs_, true, vector(build_exprs_.size(), true), state->fragment_hash_seed(), - MAX_PARTITION_DEPTH, 1, expr_mem_pool(), expr_results_pool_.get(), - expr_mem_tracker(), build_row_desc, row_desc, &ht_ctx_)); + MAX_PARTITION_DEPTH, 1, nullptr, expr_results_pool_.get(), expr_mem_tracker(), + build_row_desc, row_desc, &ht_ctx_)); } // AddCodegenDisabledMessage(state); return Status::OK(); } Status PartitionedAggregationNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); // Open the child before consuming resources in this node. RETURN_IF_ERROR(child(0)->open(state)); @@ -293,7 +295,7 @@ Status PartitionedAggregationNode::open(RuntimeState* state) { // Streaming preaggregations do all processing in GetNext(). if (is_streaming_preagg_) return Status::OK(); - RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); + RowBatch batch(child(0)->row_desc(), state->batch_size()); // Read all the rows from the child and process them. bool eos = false; do { @@ -343,6 +345,7 @@ Status PartitionedAggregationNode::open(RuntimeState* state) { } Status PartitionedAggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // 1. `!need_finalize` means this aggregation node not the level two aggregation node // 2. `grouping_exprs_.size() == 0 ` means is not group by // 3. 
`child(0)->rows_returned() == 0` mean not data from child @@ -362,7 +365,7 @@ Status PartitionedAggregationNode::get_next(RuntimeState* state, RowBatch* row_b // TODO: if ancestor node don't have a no-spilling blocking node, we could avoid a deep_copy // we should a flag indicate this node don't have to deep_copy DCHECK_EQ(row_batch->num_rows(), 0); - RowBatch batch(row_batch->row_desc(), row_batch->capacity(), _mem_tracker.get()); + RowBatch batch(row_batch->row_desc(), row_batch->capacity()); int first_row_idx = batch.num_rows(); RETURN_IF_ERROR(GetNextInternal(state, &batch, eos)); RETURN_IF_ERROR(HandleOutputStrings(&batch, first_row_idx)); @@ -403,13 +406,14 @@ Status PartitionedAggregationNode::CopyStringData(const SlotDescriptor& slot_des Tuple* tuple = batch_iter.get()->get_tuple(0); StringValue* sv = reinterpret_cast(tuple->get_slot(slot_desc.tuple_offset())); if (sv == nullptr || sv->len == 0) continue; - char* new_ptr = reinterpret_cast(pool->try_allocate(sv->len)); + Status rst; + char* new_ptr = reinterpret_cast(pool->try_allocate(sv->len, &rst)); if (UNLIKELY(new_ptr == nullptr)) { string details = Substitute( "Cannot perform aggregation at node with id $0." " Failed to allocate $1 output bytes.", _id, sv->len); - return pool->mem_tracker()->MemLimitExceeded(state_, details, sv->len); + RETURN_ALLOC_LIMIT_EXCEEDED(pool->mem_tracker(), state_, details, sv->len, rst); } memcpy(new_ptr, sv->ptr, sv->len); sv->ptr = new_ptr; @@ -534,8 +538,7 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state, RowBatc DCHECK(is_streaming_preagg_); if (child_batch_ == nullptr) { - child_batch_.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + child_batch_.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); } do { @@ -686,6 +689,7 @@ Status PartitionedAggregationNode::reset(RuntimeState* state) { Status PartitionedAggregationNode::close(RuntimeState* state) { if (is_closed()) return Status::OK(); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (!singleton_output_tuple_returned_) { GetOutputTuple(agg_fn_evals_, singleton_output_tuple_, mem_pool_.get()); @@ -725,7 +729,7 @@ PartitionedAggregationNode::Partition::~Partition() { } Status PartitionedAggregationNode::Partition::InitStreams() { - agg_fn_pool.reset(new MemPool(parent->expr_mem_tracker().get())); + agg_fn_pool.reset(new MemPool(parent->expr_mem_tracker())); DCHECK_EQ(agg_fn_evals.size(), 0); NewAggFnEvaluator::ShallowClone(parent->partition_pool_.get(), agg_fn_pool.get(), parent->agg_fn_evals_, &agg_fn_evals); @@ -849,8 +853,7 @@ Status PartitionedAggregationNode::Partition::Spill(bool more_aggregate_rows) { // TODO(ml): enable spill std::stringstream msg; msg << "New partitioned Aggregation in spill"; - LIMIT_EXCEEDED(parent->state_->query_mem_tracker(), parent->state_, msg.str()); - // RETURN_IF_ERROR(parent->state_->StartSpilling(parent->mem_tracker())); + RETURN_LIMIT_EXCEEDED(parent->state_->query_mem_tracker(), parent->state_, msg.str()); RETURN_IF_ERROR(SerializeStreamForSpilling()); @@ -921,7 +924,8 @@ Tuple* PartitionedAggregationNode::ConstructIntermediateTuple( const int fixed_size = intermediate_tuple_desc_->byte_size(); const int varlen_size = GroupingExprsVarlenSize(); const int tuple_data_size = fixed_size + varlen_size; - uint8_t* tuple_data = pool->try_allocate(tuple_data_size); + Status rst; + uint8_t* tuple_data = pool->try_allocate(tuple_data_size, &rst); if (UNLIKELY(tuple_data == nullptr)) { stringstream str; str << "Memory 
exceed limit. Cannot perform aggregation at node with id $0. Failed " @@ -932,7 +936,7 @@ Tuple* PartitionedAggregationNode::ConstructIntermediateTuple( << ", Limit: " << pool->mem_tracker()->limit() << ". " << "You can change the limit by session variable exec_mem_limit."; string details = Substitute(str.str(), _id, tuple_data_size); - *status = pool->mem_tracker()->MemLimitExceeded(state_, details, tuple_data_size); + *status = pool->mem_tracker()->mem_limit_exceeded(state_, details, tuple_data_size, rst); return nullptr; } memset(tuple_data, 0, fixed_size); @@ -1347,7 +1351,7 @@ Status PartitionedAggregationNode::ProcessStream(BufferedTupleStream3* input_str bool eos = false; const RowDescriptor* desc = AGGREGATED_ROWS ? &intermediate_row_desc_ : &(_children[0]->row_desc()); - RowBatch batch(*desc, state_->batch_size(), mem_tracker().get()); + RowBatch batch(*desc, state_->batch_size()); do { RETURN_IF_ERROR(input_stream->GetNext(&batch, &eos)); RETURN_IF_ERROR(ProcessBatch(&batch, ht_ctx_.get())); diff --git a/be/src/exec/partitioned_hash_table.cc b/be/src/exec/partitioned_hash_table.cc index b8cbdaab631b3c..11bdbfc8c70d65 100644 --- a/be/src/exec/partitioned_hash_table.cc +++ b/be/src/exec/partitioned_hash_table.cc @@ -151,7 +151,7 @@ Status PartitionedHashTableCtx::Open(RuntimeState* state) { void PartitionedHashTableCtx::Close(RuntimeState* state) { free(scratch_row_); scratch_row_ = nullptr; - expr_values_cache_.Close(tracker_); + expr_values_cache_.Close(); for (int i = 0; i < build_expr_evals_.size(); i++) { build_expr_evals_[i]->close(state); } @@ -310,13 +310,13 @@ Status PartitionedHashTableCtx::ExprValuesCache::Init(RuntimeState* state, MAX_EXPR_VALUES_ARRAY_SIZE / expr_values_bytes_per_row_)); int mem_usage = MemUsage(capacity_, expr_values_bytes_per_row_, num_exprs_); - Status st = tracker->TryConsume(mem_usage); + Status st = tracker->check_limit(mem_usage); WARN_IF_ERROR(st, "PartitionedHashTableCtx::ExprValuesCache failed"); if (UNLIKELY(!st)) { capacity_ = 0; string details = Substitute( - "PartitionedHashTableCtx::ExprValuesCache failed to allocate $0 bytes.", mem_usage); - return tracker->MemLimitExceeded(state, details, mem_usage); + "PartitionedHashTableCtx::ExprValuesCache failed to allocate $0 bytes", mem_usage); + RETURN_ALLOC_LIMIT_EXCEEDED(tracker, state, details, mem_usage, st); } int expr_values_size = expr_values_bytes_per_row_ * capacity_; @@ -338,7 +338,7 @@ Status PartitionedHashTableCtx::ExprValuesCache::Init(RuntimeState* state, return Status::OK(); } -void PartitionedHashTableCtx::ExprValuesCache::Close(const std::shared_ptr& tracker) { +void PartitionedHashTableCtx::ExprValuesCache::Close() { if (capacity_ == 0) return; cur_expr_values_ = nullptr; cur_expr_values_null_ = nullptr; @@ -348,8 +348,6 @@ void PartitionedHashTableCtx::ExprValuesCache::Close(const std::shared_ptrRelease(mem_usage); } int PartitionedHashTableCtx::ExprValuesCache::MemUsage(int capacity, int expr_values_bytes_per_row, diff --git a/be/src/exec/partitioned_hash_table.h b/be/src/exec/partitioned_hash_table.h index 23a9c3aaab9ee6..80007617d8ee81 100644 --- a/be/src/exec/partitioned_hash_table.h +++ b/be/src/exec/partitioned_hash_table.h @@ -211,8 +211,7 @@ class PartitionedHashTableCtx { const std::vector& build_exprs); /// Frees up various resources and updates memory tracker with proper accounting. - /// 'tracker' should be the same memory tracker which was passed in for Init(). 
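// --- Illustrative sketch (not part of the patch) ---------------------------
// Why ExprValuesCache::Close() no longer takes a tracker: Init() switched from
// TryConsume() (count up front, Release() in Close()) to check_limit(), which
// only verifies that the allocation would fit; the bytes themselves are
// recorded by the TCMalloc new/delete hook when they are actually allocated.
// A minimal sketch of that check, assuming an ancestor walk like the
// SimpleTracker chain shown earlier ({consumption, limit} pairs stand in for
// real trackers):
#include <cstdint>
#include <utility>
#include <vector>

bool would_fit(const std::vector<std::pair<int64_t, int64_t>>& ancestors,
               int64_t bytes) {
    for (const auto& [consumption, limit] : ancestors) {
        if (limit >= 0 && consumption + bytes > limit) {
            return false; // caller raises MemLimitExceeded, as Init() does above
        }
    }
    return true; // nothing is consumed here; the hook does the bookkeeping
}
// ---------------------------------------------------------------------------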
- void Close(const std::shared_ptr& tracker); + void Close(); /// Resets the cache states (iterators, end pointers etc) before writing. void Reset() noexcept; diff --git a/be/src/exec/repeat_node.cpp b/be/src/exec/repeat_node.cpp index 78d937edd28c3f..439df40bab4cd1 100644 --- a/be/src/exec/repeat_node.cpp +++ b/be/src/exec/repeat_node.cpp @@ -22,6 +22,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" namespace doris { @@ -44,6 +45,7 @@ RepeatNode::~RepeatNode() {} Status RepeatNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _runtime_state = state; _tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); if (_tuple_desc == nullptr) { @@ -54,6 +56,7 @@ Status RepeatNode::prepare(RuntimeState* state) { } Status RepeatNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_CANCELLED(state); @@ -162,6 +165,7 @@ Status RepeatNode::get_repeated_batch(RowBatch* child_row_batch, int repeat_id_i } Status RepeatNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); DCHECK(_repeat_id_idx >= 0); @@ -175,8 +179,7 @@ Status RepeatNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) return Status::OK(); } - _child_row_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + _child_row_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); RETURN_IF_ERROR(child(0)->get_next(state, _child_row_batch.get(), &_child_eos)); if (_child_row_batch->num_rows() <= 0) { @@ -203,6 +206,7 @@ Status RepeatNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _child_row_batch.reset(nullptr); RETURN_IF_ERROR(child(0)->close(state)); return ExecNode::close(state); diff --git a/be/src/exec/schema_scan_node.cpp b/be/src/exec/schema_scan_node.cpp index b393452883034e..e09d44ff7a7047 100644 --- a/be/src/exec/schema_scan_node.cpp +++ b/be/src/exec/schema_scan_node.cpp @@ -25,6 +25,7 @@ #include "gen_cpp/Types_types.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/string_value.h" #include "runtime/tuple_row.h" #include "util/runtime_profile.h" @@ -99,9 +100,10 @@ Status SchemaScanNode::prepare(RuntimeState* state) { } RETURN_IF_ERROR(ScanNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); // new one mem pool - _tuple_pool.reset(new (std::nothrow) MemPool(mem_tracker().get())); + _tuple_pool.reset(new (std::nothrow) MemPool()); if (nullptr == _tuple_pool.get()) { return Status::InternalError("Allocate MemPool failed."); @@ -187,6 +189,7 @@ Status SchemaScanNode::prepare(RuntimeState* state) { } Status SchemaScanNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (!_is_init) { return Status::InternalError("Open before Init."); } @@ -241,6 +244,7 @@ Status SchemaScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* } 
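// --- Illustrative sketch (not part of the patch) ---------------------------
// What the SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_* macros sprinkled through
// these exec nodes plausibly expand to: an RAII guard that swaps in a
// thread-local tracker and restores the previous one on scope exit, so every
// allocation inside prepare()/open()/get_next()/close() is attributed to the
// node. This is a guess at the mechanism, not the real macro body:
#include <memory>
#include <utility>

class MemTracker; // the real Doris class; only a handle is needed here

inline thread_local std::shared_ptr<MemTracker> tls_mem_tracker;

class ScopedSwitchThreadMemTracker {
public:
    explicit ScopedSwitchThreadMemTracker(std::shared_ptr<MemTracker> t)
            : _saved(std::exchange(tls_mem_tracker, std::move(t))) {}
    ~ScopedSwitchThreadMemTracker() { tls_mem_tracker = std::move(_saved); }

    ScopedSwitchThreadMemTracker(const ScopedSwitchThreadMemTracker&) = delete;
    ScopedSwitchThreadMemTracker& operator=(const ScopedSwitchThreadMemTracker&) = delete;

private:
    std::shared_ptr<MemTracker> _saved;
};
// The TCMalloc new/delete hook would then charge every allocation on this
// thread to whatever tls_mem_tracker currently points at.
// ---------------------------------------------------------------------------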
RETURN_IF_CANCELLED(state); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (reached_limit()) { @@ -305,6 +309,7 @@ Status SchemaScanNode::close(RuntimeState* state) { return Status::OK(); } RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); _tuple_pool.reset(); diff --git a/be/src/exec/select_node.cpp b/be/src/exec/select_node.cpp index 25057686c49519..7648ef10b037e8 100644 --- a/be/src/exec/select_node.cpp +++ b/be/src/exec/select_node.cpp @@ -22,6 +22,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" namespace doris { @@ -33,12 +34,13 @@ SelectNode::SelectNode(ObjectPool* pool, const TPlanNode& tnode, const Descripto Status SelectNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); - _child_row_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + _child_row_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size())); return Status::OK(); } Status SelectNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(child(0)->open(state)); @@ -48,6 +50,7 @@ Status SelectNode::open(RuntimeState* state) { Status SelectNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (reached_limit() || (_child_row_idx == _child_row_batch->num_rows() && _child_eos)) { diff --git a/be/src/exec/set_operation_node.cpp b/be/src/exec/set_operation_node.cpp index 0ca6dd57c1fff6..488be80ea909e6 100644 --- a/be/src/exec/set_operation_node.cpp +++ b/be/src/exec/set_operation_node.cpp @@ -23,6 +23,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" namespace doris { SetOperationNode::SetOperationNode(ObjectPool* pool, const TPlanNode& tnode, @@ -38,9 +39,10 @@ Status SetOperationNode::init(const TPlanNode& tnode, RuntimeState* state) { Status SetOperationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); DCHECK(_tuple_desc != nullptr); - _build_pool.reset(new MemPool(mem_tracker().get())); + _build_pool.reset(new MemPool()); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _probe_timer = ADD_TIMER(runtime_profile(), "ProbeTime"); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -68,6 +70,7 @@ Status SetOperationNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); for (auto& exprs : _child_expr_lists) { Expr::close(exprs, state); } @@ -134,6 +137,7 @@ bool SetOperationNode::equals(TupleRow* row, TupleRow* other) { Status SetOperationNode::open(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); + 
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "SetOperation, while constructing the hash table."); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); // open result expr lists. @@ -143,7 +147,7 @@ Status SetOperationNode::open(RuntimeState* state) { // initial build hash table used for remove duplicated _hash_tbl.reset(new HashTable(_child_expr_lists[0], _child_expr_lists[1], _build_tuple_size, true, _find_nulls, id(), mem_tracker(), 1024)); - RowBatch build_batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); + RowBatch build_batch(child(0)->row_desc(), state->batch_size()); RETURN_IF_ERROR(child(0)->open(state)); bool eos = false; @@ -153,7 +157,6 @@ Status SetOperationNode::open(RuntimeState* state) { RETURN_IF_ERROR(child(0)->get_next(state, &build_batch, &eos)); // take ownership of tuple data of build_batch _build_pool->acquire_data(build_batch.tuple_data_pool(), false); - RETURN_IF_LIMIT_EXCEEDED(state, " SetOperation, while constructing the hash table."); // build hash table and remove duplicate items for (int i = 0; i < build_batch.num_rows(); ++i) { VLOG_ROW << "build row: " diff --git a/be/src/exec/spill_sort_node.cc b/be/src/exec/spill_sort_node.cc index ef527a18ede4e6..4fae0c042044ef 100644 --- a/be/src/exec/spill_sort_node.cc +++ b/be/src/exec/spill_sort_node.cc @@ -21,6 +21,7 @@ #include "runtime/row_batch.h" #include "runtime/runtime_state.h" #include "runtime/sorted_run_merger.h" +#include "runtime/thread_context.h" #include "util/debug_util.h" namespace doris { @@ -44,6 +45,7 @@ Status SpillSortNode::init(const TPlanNode& tnode, RuntimeState* state) { Status SpillSortNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(_sort_exec_exprs.prepare(state, child(0)->row_desc(), _row_descriptor, expr_mem_tracker())); // AddExprCtxsToFree(_sort_exec_exprs); @@ -51,6 +53,7 @@ Status SpillSortNode::prepare(RuntimeState* state) { } Status SpillSortNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(_sort_exec_exprs.open(state)); @@ -81,6 +84,7 @@ Status SpillSortNode::open(RuntimeState* state) { } Status SpillSortNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); // RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT, state)); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); @@ -153,7 +157,7 @@ void SpillSortNode::debug_string(int indentation_level, stringstream* out) const } Status SpillSortNode::sort_input(RuntimeState* state) { - RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); + RowBatch batch(child(0)->row_desc(), state->batch_size()); bool eos = false; do { batch.reset(); diff --git a/be/src/exec/table_function_node.cpp b/be/src/exec/table_function_node.cpp index 6eac8eb8243888..76a894b65a95fd 100644 --- a/be/src/exec/table_function_node.cpp +++ b/be/src/exec/table_function_node.cpp @@ -23,6 +23,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "exprs/table_function/table_function_factory.h" @@ -81,7 
+82,7 @@ Status TableFunctionNode::_prepare_output_slot_ids(const TPlanNode& tnode) { Status TableFunctionNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); - + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); RETURN_IF_ERROR(Expr::prepare(_fn_ctxs, state, _row_descriptor, expr_mem_tracker())); for (auto fn : _fns) { RETURN_IF_ERROR(fn->prepare()); @@ -90,6 +91,7 @@ Status TableFunctionNode::prepare(RuntimeState* state) { } Status TableFunctionNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(ExecNode::open(state)); @@ -182,6 +184,7 @@ bool TableFunctionNode::_roll_table_functions(int last_eos_idx) { // And the inner loop is to expand the row by table functions, and output row by row. Status TableFunctionNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); const RowDescriptor& parent_rowdesc = row_batch->row_desc(); @@ -203,7 +206,7 @@ Status TableFunctionNode::get_next(RuntimeState* state, RowBatch* row_batch, boo RETURN_IF_ERROR(state->check_query_state("TableFunctionNode, while getting next batch.")); if (_cur_child_batch == nullptr) { - _cur_child_batch.reset(new RowBatch(child_rowdesc, state->batch_size(), mem_tracker().get())); + _cur_child_batch.reset(new RowBatch(child_rowdesc, state->batch_size())); } if (_child_batch_exhausted) { if (_child_eos) { diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index cbeda55bd1394d..123cb521a416bf 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -18,7 +18,6 @@ #include "exec/tablet_info.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "runtime/row_batch.h" #include "runtime/tuple_row.h" #include "util/string_parser.hpp" @@ -161,8 +160,7 @@ OlapTablePartitionParam::OlapTablePartitionParam(std::shared_ptrtuple_desc()->slots()), - _mem_tracker(MemTracker::CreateTracker(-1, "OlapTablePartitionParam")) { + _mem_tracker(MemTracker::create_virtual_tracker(-1, "OlapTablePartitionParam")) { for (auto slot : _slots) { _partition_block.insert({slot->get_empty_mutable_column(), slot->get_data_type_ptr(), slot->col_name()}); } } VOlapTablePartitionParam::~VOlapTablePartitionParam() { - _mem_tracker->Release(_mem_usage); + _mem_tracker->release(_mem_usage); } Status VOlapTablePartitionParam::init() { @@ -509,7 +507,7 @@ Status VOlapTablePartitionParam::init() { } _mem_usage = _partition_block.allocated_bytes(); - _mem_tracker->Consume(_mem_usage); + _mem_tracker->consume(_mem_usage); return Status::OK(); } diff --git a/be/src/exec/tablet_info.h b/be/src/exec/tablet_info.h index f47566351a8c00..d51b306c533cea 100644 --- a/be/src/exec/tablet_info.h +++ b/be/src/exec/tablet_info.h @@ -36,7 +36,6 @@ namespace doris { class MemPool; -class MemTracker; class RowBatch; struct OlapTableIndexSchema { @@ -200,7 +199,6 @@ class OlapTablePartitionParam { std::vector _distributed_slot_descs; ObjectPool _obj_pool; - std::shared_ptr _mem_tracker; std::unique_ptr _mem_pool; std::vector _partitions; std::unique_ptr> diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp index 590e1136d921dc..e8c285b6a9bfb6 100644 --- a/be/src/exec/tablet_sink.cpp +++ b/be/src/exec/tablet_sink.cpp @@ -29,6 +29,7 @@ #include 
"runtime/exec_env.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "service/backend_options.h" #include "service/brpc.h" @@ -50,6 +51,7 @@ NodeChannel::NodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int if (_parent->_transfer_data_by_brpc_attachment) { _tuple_data_buffer_ptr = &_tuple_data_buffer; } + _node_channel_tracker = MemTracker::create_tracker(-1, "NodeChannel"); } NodeChannel::~NodeChannel() { @@ -71,6 +73,7 @@ NodeChannel::~NodeChannel() { // no need to set _cancel_msg because the error will be // returned directly via "TabletSink::prepare()" method. Status NodeChannel::init(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); _tuple_desc = _parent->_output_tuple_desc; auto node = _parent->_nodes_info->find_node(_node_id); if (node == nullptr) { @@ -84,7 +87,7 @@ Status NodeChannel::init(RuntimeState* state) { _row_desc.reset(new RowDescriptor(_tuple_desc, false)); _batch_size = state->batch_size(); - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker.get())); + _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); _stub = state->exec_env()->brpc_internal_client_cache()->get_client(_node_info.host, _node_info.brpc_port); @@ -112,6 +115,7 @@ Status NodeChannel::init(RuntimeState* state) { } void NodeChannel::open() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); PTabletWriterOpenRequest request; request.set_allocated_id(&_parent->_load_id); request.set_index_id(_index_channel->_index_id); @@ -156,6 +160,7 @@ void NodeChannel::_cancel_with_msg(const std::string& msg) { } Status NodeChannel::open_wait() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); _open_closure->join(); if (_open_closure->cntl.Failed()) { if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available( @@ -232,6 +237,7 @@ Status NodeChannel::open_wait() { } Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); // If add_row() when _eos_is_produced==true, there must be sth wrong, we can only mark this channel as failed. auto st = none_of({_cancelled, _eos_is_produced}); if (!st.ok()) { @@ -248,8 +254,7 @@ Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { // But there is still some unfinished things, we do mem limit here temporarily. // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). - while (!_cancelled && _parent->_mem_tracker->AnyLimitExceeded(MemLimit::HARD) && - _pending_batches_num > 0) { + while (!_cancelled && _parent->_mem_tracker->any_limit_exceeded() && _pending_batches_num > 0) { SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); SleepFor(MonoDelta::FromMilliseconds(10)); } @@ -264,7 +269,7 @@ Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { _pending_batches_num++; } - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker.get())); + _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); _cur_add_batch_request.clear_tablet_ids(); row_no = _cur_batch->add_row(); @@ -297,8 +302,7 @@ Status NodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { // But there is still some unfinished things, we do mem limit here temporarily. 
// _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). - while (!_cancelled && _parent->_mem_tracker->AnyLimitExceeded(MemLimit::HARD) && - _pending_batches_num > 0) { + while (!_cancelled && _parent->_mem_tracker->any_limit_exceeded() && _pending_batches_num > 0) { SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); SleepFor(MonoDelta::FromMilliseconds(10)); } @@ -313,7 +317,7 @@ Status NodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { _pending_batches_num++; } - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker.get())); + _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); _cur_add_batch_request.clear_tablet_ids(); row_no = _cur_batch->add_row(); @@ -329,6 +333,7 @@ Status NodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { } Status NodeChannel::mark_close() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); auto st = none_of({_cancelled, _eos_is_produced}); if (!st.ok()) { if (_cancelled) { @@ -355,6 +360,7 @@ Status NodeChannel::mark_close() { } Status NodeChannel::close_wait(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); auto st = none_of({_cancelled, !_eos_is_produced}); if (!st.ok()) { if (_cancelled) { @@ -402,6 +408,7 @@ Status NodeChannel::close_wait(RuntimeState* state) { } void NodeChannel::cancel(const std::string& cancel_msg) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); // we don't need to wait last rpc finished, cause closure's release/reset will join. // But do we need brpc::StartCancel(call_id)? _cancel_with_msg(cancel_msg); @@ -426,7 +433,8 @@ void NodeChannel::cancel(const std::string& cancel_msg) { request.release_id(); } -int NodeChannel::try_send_and_fetch_status(std::unique_ptr& thread_pool_token) { +int NodeChannel::try_send_and_fetch_status(RuntimeState* state, + std::unique_ptr& thread_pool_token) { auto st = none_of({_cancelled, _send_finished}); if (!st.ok()) { return 0; @@ -434,7 +442,8 @@ int NodeChannel::try_send_and_fetch_status(std::unique_ptr& thr bool is_finished = true; if (!_add_batch_closure->is_packet_in_flight() && _pending_batches_num > 0 && _last_patch_processed_finished.compare_exchange_strong(is_finished, false)) { - auto s = thread_pool_token->submit_func(std::bind(&NodeChannel::try_send_batch, this)); + auto s = thread_pool_token->submit_func( + std::bind(&NodeChannel::try_send_batch, this, state)); if (!s.ok()) { _cancel_with_msg("submit send_batch task to send_batch_thread_pool failed"); } @@ -442,7 +451,9 @@ int NodeChannel::try_send_and_fetch_status(std::unique_ptr& thr return _send_finished ? 
0 : 1; } -void NodeChannel::try_send_batch() { +void NodeChannel::try_send_batch(RuntimeState* state) { + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), _node_channel_tracker); SCOPED_ATOMIC_TIMER(&_actual_consume_ns); AddBatchReq send_batch; { @@ -530,6 +541,7 @@ Status NodeChannel::none_of(std::initializer_list vars) { } void NodeChannel::clear_all_batches() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_node_channel_tracker); std::lock_guard lg(_pending_batches_lock); std::queue empty; std::swap(_pending_batches, empty); @@ -539,6 +551,7 @@ void NodeChannel::clear_all_batches() { IndexChannel::~IndexChannel() {} Status IndexChannel::init(RuntimeState* state, const std::vector& tablets) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_index_channel_tracker); for (auto& tablet : tablets) { auto location = _parent->_location->find_tablet(tablet.tablet_id); if (location == nullptr) { @@ -571,6 +584,7 @@ Status IndexChannel::init(RuntimeState* state, const std::vectorload_job_id()), - state->instance_mem_tracker(), true, false); - + MemTracker::create_tracker(-1, "OlapTableSink:" + std::to_string(state->load_job_id()), + state->instance_mem_tracker()); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_profile->total_time_counter()); // Prepare the exprs to run. @@ -738,7 +752,7 @@ Status OlapTableSink::prepare(RuntimeState* state) { } _output_row_desc = _pool->add(new RowDescriptor(_output_tuple_desc, false)); - _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size(), _mem_tracker.get())); + _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size())); _max_decimalv2_val.resize(_output_tuple_desc->slots().size()); _min_decimalv2_val.resize(_output_tuple_desc->slots().size()); @@ -809,6 +823,7 @@ Status OlapTableSink::prepare(RuntimeState* state) { } Status OlapTableSink::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_profile->total_time_counter()); SCOPED_TIMER(_open_timer); // Prepare the exprs to run. @@ -839,13 +854,14 @@ Status OlapTableSink::open(RuntimeState* state) { _send_batch_thread_pool_token = state->exec_env()->send_batch_thread_pool()->new_token( ThreadPool::ExecutionMode::CONCURRENT, send_batch_parallelism); RETURN_IF_ERROR(Thread::create( - "OlapTableSink", "send_batch_process", [this]() { this->_send_batch_process(); }, - &_sender_thread)); + "OlapTableSink", "send_batch_process", + [this, state]() { this->_send_batch_process(state); }, &_sender_thread)); return Status::OK(); } Status OlapTableSink::send(RuntimeState* state, RowBatch* input_batch) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_profile->total_time_counter()); // update incrementally so that FE can get the progress. // the real 'num_rows_load_total' will be set when sink being closed. @@ -928,6 +944,7 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { /// So here we use a flag to prevent repeated close operations. return _close_status; } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); Status status = close_status; if (status.ok()) { // only if status is ok can we call this _profile->total_time_counter(). 
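// --- Illustrative sketch (not part of the patch) ---------------------------
// Two patterns worth noting in the NodeChannel changes above. First, the
// send-batch task now receives RuntimeState because it runs on a shared pool
// thread with no task context of its own; SCOPED_ATTACH_TASK_THREAD_4ARG
// presumably binds that thread to the query's tracker before it allocates
// anything. Second, add_row() applies backpressure instead of failing: it
// sleeps while any tracker limit in the chain is exceeded and batches are
// still in flight, letting the sender drain. A hedged sketch of that loop
// shape (all names here are stand-ins for the real members):
#include <atomic>
#include <chrono>
#include <functional>
#include <thread>

void wait_for_headroom(const std::atomic<bool>& cancelled,
                       const std::atomic<int>& pending_batches,
                       const std::function<bool()>& any_limit_exceeded) {
    // Block the producer while memory is over limit and the consumer thread
    // still has pending batches it can flush.
    while (!cancelled.load() && any_limit_exceeded() && pending_batches.load() > 0) {
        std::this_thread::sleep_for(std::chrono::milliseconds(10)); // same 10ms backoff as above
    }
}
// ---------------------------------------------------------------------------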
@@ -1209,14 +1226,16 @@ Status OlapTableSink::_validate_data(RuntimeState* state, RowBatch* batch, Bitma return Status::OK(); } -void OlapTableSink::_send_batch_process() { +void OlapTableSink::_send_batch_process(RuntimeState* state) { SCOPED_TIMER(_non_blocking_send_timer); + SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()), + state->fragment_instance_id(), _mem_tracker); do { int running_channels_num = 0; for (auto index_channel : _channels) { - index_channel->for_each_node_channel([&running_channels_num, this](const std::shared_ptr& ch) { + index_channel->for_each_node_channel([&running_channels_num, this, state](const std::shared_ptr& ch) { running_channels_num += - ch->try_send_and_fetch_status(this->_send_batch_thread_pool_token); + ch->try_send_and_fetch_status(state, this->_send_batch_thread_pool_token); }); } diff --git a/be/src/exec/tablet_sink.h b/be/src/exec/tablet_sink.h index b31f3841c337a7..32fc7db582e820 100644 --- a/be/src/exec/tablet_sink.h +++ b/be/src/exec/tablet_sink.h @@ -176,9 +176,10 @@ class NodeChannel { // 1: running, haven't reach eos. // only allow 1 rpc in flight // plz make sure, this func should be called after open_wait(). - int try_send_and_fetch_status(std::unique_ptr& thread_pool_token); + int try_send_and_fetch_status(RuntimeState* state, + std::unique_ptr& thread_pool_token); - void try_send_batch(); + void try_send_batch(RuntimeState* state); void time_report(std::unordered_map* add_batch_counter_map, int64_t* serialize_batch_ns, int64_t* mem_exceeded_block_ns, @@ -201,7 +202,6 @@ class NodeChannel { Status none_of(std::initializer_list vars); - // TODO(HW): remove after mem tracker shared void clear_all_batches(); std::string channel_info() const { @@ -220,6 +220,8 @@ class NodeChannel { std::string _load_info; std::string _name; + std::shared_ptr _node_channel_tracker; + TupleDescriptor* _tuple_desc = nullptr; NodeInfo _node_info; @@ -279,7 +281,9 @@ class NodeChannel { class IndexChannel { public: IndexChannel(OlapTableSink* parent, int64_t index_id, int32_t schema_hash) - : _parent(parent), _index_id(index_id), _schema_hash(schema_hash) {} + : _parent(parent), _index_id(index_id), _schema_hash(schema_hash) { + _index_channel_tracker = MemTracker::create_tracker(-1, "IndexChannel"); + } ~IndexChannel(); Status init(RuntimeState* state, const std::vector& tablets); @@ -323,6 +327,8 @@ class IndexChannel { // key is tablet_id, value is error message std::unordered_map _failed_channels_msgs; Status _intolerable_failure_status = Status::OK(); + + std::shared_ptr _index_channel_tracker; }; // Write data to Olap Table. @@ -365,7 +371,7 @@ class OlapTableSink : public DataSink { // the consumer func of sending pending batches in every NodeChannel. // use polling & NodeChannel::try_send_and_fetch_status() to achieve nonblocking sending. 
// only focus on pending batches and channel status, the internal errors of NodeChannels will be handled by the producer - void _send_batch_process(); + void _send_batch_process(RuntimeState* state); protected: friend class NodeChannel; diff --git a/be/src/exec/topn_node.cpp b/be/src/exec/topn_node.cpp index 7e98e1d329bfbb..3d8160ddb8f117 100644 --- a/be/src/exec/topn_node.cpp +++ b/be/src/exec/topn_node.cpp @@ -27,6 +27,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/tuple.h" #include "runtime/tuple_row.h" #include "util/runtime_profile.h" @@ -59,7 +60,8 @@ Status TopNNode::init(const TPlanNode& tnode, RuntimeState* state) { Status TopNNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); - _tuple_pool.reset(new MemPool(mem_tracker().get())); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); + _tuple_pool.reset(new MemPool()); RETURN_IF_ERROR(_sort_exec_exprs.prepare(state, child(0)->row_desc(), _row_descriptor, expr_mem_tracker())); // AddExprCtxsToFree(_sort_exec_exprs); @@ -74,6 +76,7 @@ Status TopNNode::prepare(RuntimeState* state) { } Status TopNNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_CANCELLED(state); @@ -95,7 +98,7 @@ Status TopNNode::open(RuntimeState* state) { // Limit of 0, no need to fetch anything from children. if (_limit != 0) { - RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); + RowBatch batch(child(0)->row_desc(), state->batch_size()); bool eos = false; do { @@ -126,6 +129,7 @@ Status TopNNode::open(RuntimeState* state) { } Status TopNNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); @@ -167,6 +171,7 @@ Status TopNNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); if (_tuple_pool.get() != nullptr) { _tuple_pool->free_all(); } diff --git a/be/src/exec/union_node.cpp b/be/src/exec/union_node.cpp index cbb4bc9d5d27f0..f7f6c1d42a94e8 100644 --- a/be/src/exec/union_node.cpp +++ b/be/src/exec/union_node.cpp @@ -25,10 +25,9 @@ #include "runtime/tuple_row.h" // #include "util/runtime_profile_counters.h" #include "gen_cpp/PlanNodes_types.h" +#include "runtime/thread_context.h" #include "util/runtime_profile.h" -// - namespace doris { UnionNode::UnionNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) @@ -69,6 +68,7 @@ Status UnionNode::init(const TPlanNode& tnode, RuntimeState* state) { Status UnionNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); DCHECK(_tuple_desc != nullptr); _materialize_exprs_evaluate_timer = @@ -94,6 +94,7 @@ Status UnionNode::prepare(RuntimeState* state) { } Status UnionNode::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); 
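// --- Illustrative sketch (not part of the patch) ---------------------------
// The recurring shape of the TopN/Union changes above: MemPool and RowBatch no
// longer take a tracker argument; attribution comes from whatever tracker the
// enclosing scope switched in. A sketch of that pattern using the guard type
// from the earlier sketch (prepare_sketch and its argument are illustrative):
#include <memory>

Status prepare_sketch(std::shared_ptr<MemTracker> node_tracker) {
    // Everything allocated below is charged to node_tracker via the hook.
    ScopedSwitchThreadMemTracker guard(std::move(node_tracker));
    auto tuple_pool = std::make_unique<MemPool>(); // no tracker parameter anymore
    // ... create row batches, sorters, etc.; no manual consume()/release() ...
    return Status::OK();
}
// ---------------------------------------------------------------------------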
RETURN_IF_ERROR(ExecNode::open(state)); // open const expr lists. @@ -154,8 +155,7 @@ Status UnionNode::get_next_materialized(RuntimeState* state, RowBatch* row_batch // Child row batch was either never set or we're moving on to a different child. if (_child_batch.get() == nullptr) { DCHECK_LT(_child_idx, _children.size()); - _child_batch.reset(new RowBatch(child(_child_idx)->row_desc(), state->batch_size(), - mem_tracker().get())); + _child_batch.reset(new RowBatch(child(_child_idx)->row_desc(), state->batch_size())); _child_row_idx = 0; // open the current child unless it's the first child, which was already opened in // UnionNode::open(). @@ -233,6 +233,7 @@ Status UnionNode::get_next_const(RuntimeState* state, RowBatch* row_batch) { } Status UnionNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); @@ -280,6 +281,7 @@ Status UnionNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) Status UnionNode::close(RuntimeState* state) { if (is_closed()) return Status::OK(); _child_batch.reset(); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker()); for (auto& exprs : _const_expr_lists) { Expr::close(exprs, state); } diff --git a/be/src/exprs/agg_fn.h b/be/src/exprs/agg_fn.h index aa15a67c89848e..684c937cce8f91 100644 --- a/be/src/exprs/agg_fn.h +++ b/be/src/exprs/agg_fn.h @@ -27,7 +27,6 @@ namespace doris { using doris_udf::FunctionContext; class MemPool; -class MemTracker; class ObjectPool; class RuntimeState; class Tuple; diff --git a/be/src/exprs/agg_fn_evaluator.cpp b/be/src/exprs/agg_fn_evaluator.cpp index d83920c43bb20e..726575cb4d3c5c 100644 --- a/be/src/exprs/agg_fn_evaluator.cpp +++ b/be/src/exprs/agg_fn_evaluator.cpp @@ -149,7 +149,7 @@ Status AggFnEvaluator::prepare(RuntimeState* state, const RowDescriptor& desc, M _intermediate_slot_desc = intermediate_slot_desc; _string_buffer_len = 0; - _mem_tracker = mem_tracker; + _mem_tracker = MemTracker::create_virtual_tracker(-1, "AggFnEvaluator", mem_tracker); Status status = Expr::prepare(_input_exprs_ctxs, state, desc, _mem_tracker); RETURN_IF_ERROR(status); @@ -264,7 +264,7 @@ Status AggFnEvaluator::open(RuntimeState* state, FunctionContext* agg_fn_ctx) { void AggFnEvaluator::close(RuntimeState* state) { Expr::close(_input_exprs_ctxs, state); if (UNLIKELY(_total_mem_consumption > 0)) { - _mem_tracker->Release(_total_mem_consumption); + _mem_tracker->release(_total_mem_consumption); } } @@ -435,7 +435,7 @@ void AggFnEvaluator::update_mem_limlits(int len) { _accumulated_mem_consumption += len; // per 16M , update mem_tracker one time if (UNLIKELY(_accumulated_mem_consumption > 16777216)) { - _mem_tracker->Consume(_accumulated_mem_consumption); + _mem_tracker->consume(_accumulated_mem_consumption); _total_mem_consumption += _accumulated_mem_consumption; _accumulated_mem_consumption = 0; } diff --git a/be/src/exprs/anyval_util.cpp b/be/src/exprs/anyval_util.cpp index fabdb505cccb36..ed1f6ddab3ea41 100644 --- a/be/src/exprs/anyval_util.cpp +++ b/be/src/exprs/anyval_util.cpp @@ -38,9 +38,10 @@ Status allocate_any_val(RuntimeState* state, MemPool* pool, const TypeDescriptor const std::string& mem_limit_exceeded_msg, AnyVal** result) { const int anyval_size = AnyValUtil::any_val_size(type); const int anyval_alignment = AnyValUtil::any_val_alignment(type); - *result = 
reinterpret_cast(pool->try_allocate_aligned(anyval_size, anyval_alignment)); + Status rst; + *result = reinterpret_cast(pool->try_allocate_aligned(anyval_size, anyval_alignment, &rst)); if (*result == nullptr) { - return pool->mem_tracker()->MemLimitExceeded(state, mem_limit_exceeded_msg, anyval_size); + RETURN_ALLOC_LIMIT_EXCEEDED(pool->mem_tracker(), state, mem_limit_exceeded_msg, anyval_size, rst); } memset(static_cast(*result), 0, anyval_size); return Status::OK(); diff --git a/be/src/exprs/bloomfilter_predicate.h b/be/src/exprs/bloomfilter_predicate.h index a6b7f83636c4a6..f67dfc0c92a407 100644 --- a/be/src/exprs/bloomfilter_predicate.h +++ b/be/src/exprs/bloomfilter_predicate.h @@ -88,18 +88,19 @@ class IBloomFilterFuncBase { virtual Status assign(const char* data, int len) = 0; virtual Status get_data(char** data, int* len) = 0; - virtual MemTracker* tracker() = 0; virtual void light_copy(IBloomFilterFuncBase* other) = 0; }; template class BloomFilterFuncBase : public IBloomFilterFuncBase { public: - BloomFilterFuncBase(MemTracker* tracker) : _tracker(tracker), _inited(false) {} + BloomFilterFuncBase() : _inited(false) { + _tracker = MemTracker::create_virtual_tracker(-1, "BloomFilterFunc"); + } virtual ~BloomFilterFuncBase() { if (_tracker != nullptr) { - _tracker->Release(_bloom_filter_alloced); + _tracker->release(_bloom_filter_alloced); } } @@ -115,7 +116,7 @@ class BloomFilterFuncBase : public IBloomFilterFuncBase { _bloom_filter_alloced = bloom_filter_length; _bloom_filter.reset(BloomFilterAdaptor::create()); RETURN_IF_ERROR(_bloom_filter->init(bloom_filter_length)); - _tracker->Consume(_bloom_filter_alloced); + _tracker->consume(_bloom_filter_alloced); _inited = true; return Status::OK(); } @@ -138,7 +139,7 @@ class BloomFilterFuncBase : public IBloomFilterFuncBase { } _bloom_filter_alloced = len; - _tracker->Consume(_bloom_filter_alloced); + _tracker->consume(_bloom_filter_alloced); return _bloom_filter->init(data, len); } @@ -148,18 +149,16 @@ class BloomFilterFuncBase : public IBloomFilterFuncBase { return Status::OK(); } - MemTracker* tracker() override { return _tracker; } - void light_copy(IBloomFilterFuncBase* bloomfilter_func) override { auto other_func = static_cast(bloomfilter_func); - _tracker = nullptr; + _tracker = nullptr; // Avoid repeated release when ~BloomFilterFuncBase _bloom_filter_alloced = other_func->_bloom_filter_alloced; _bloom_filter = other_func->_bloom_filter; _inited = other_func->_inited; } protected: - MemTracker* _tracker; + std::shared_ptr _tracker; // bloom filter size int32_t _bloom_filter_alloced; std::shared_ptr _bloom_filter; @@ -298,7 +297,7 @@ struct BloomFilterTypeTraits { template class BloomFilterFunc final : public BloomFilterFuncBase { public: - BloomFilterFunc(MemTracker* tracker) : BloomFilterFuncBase(tracker) {} + BloomFilterFunc() : BloomFilterFuncBase() {} ~BloomFilterFunc() = default; diff --git a/be/src/exprs/create_predicate_function.h b/be/src/exprs/create_predicate_function.h index 27aef88c921498..b8acae59646755 100644 --- a/be/src/exprs/create_predicate_function.h +++ b/be/src/exprs/create_predicate_function.h @@ -20,7 +20,6 @@ #include "exprs/bloomfilter_predicate.h" #include "exprs/hybrid_set.h" #include "exprs/minmax_predicate.h" -#include "runtime/mem_tracker.h" namespace doris { @@ -28,7 +27,7 @@ class MinmaxFunctionTraits { public: using BasePtr = MinMaxFuncBase*; template - static BasePtr get_function([[maybe_unused]] MemTracker* tracker) { + static BasePtr get_function() { return new (std::nothrow) 
MinMaxNumFunc::CppType>(); }; }; @@ -37,7 +36,7 @@ class HybridSetTraits { public: using BasePtr = HybridSetBase*; template - static BasePtr get_function([[maybe_unused]] MemTracker* tracker) { + static BasePtr get_function() { using CppType = typename PrimitiveTypeTraits::CppType; using Set = std::conditional_t, StringValueSet, HybridSet>; @@ -49,8 +48,8 @@ class BloomFilterTraits { public: using BasePtr = IBloomFilterFuncBase*; template - static BasePtr get_function(MemTracker* tracker) { - return new BloomFilterFunc(tracker); + static BasePtr get_function() { + return new BloomFilterFunc(); }; }; @@ -58,49 +57,48 @@ template class PredicateFunctionCreator { public: template - static typename Traits::BasePtr create(MemTracker* tracker = nullptr) { - return Traits::template get_function(tracker); + static typename Traits::BasePtr create() { + return Traits::template get_function(); } }; template -typename Traits::BasePtr create_predicate_function(PrimitiveType type, - MemTracker* tracker = nullptr) { +typename Traits::BasePtr create_predicate_function(PrimitiveType type) { using Creator = PredicateFunctionCreator; switch (type) { case TYPE_BOOLEAN: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_TINYINT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_SMALLINT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_INT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_BIGINT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_LARGEINT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_FLOAT: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_DOUBLE: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_DECIMALV2: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_DATE: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_DATETIME: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_CHAR: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_VARCHAR: - return Creator::template create(tracker); + return Creator::template create(); case TYPE_STRING: - return Creator::template create(tracker); + return Creator::template create(); default: DCHECK(false) << "Invalid type."; @@ -117,8 +115,8 @@ inline auto create_set(PrimitiveType type) { return create_predicate_function(type); } -inline auto create_bloom_filter(MemTracker* tracker, PrimitiveType type) { - return create_predicate_function(type, tracker); +inline auto create_bloom_filter(PrimitiveType type) { + return create_predicate_function(type); } } // namespace doris \ No newline at end of file diff --git a/be/src/exprs/expr.cpp b/be/src/exprs/expr.cpp index 73f3775247f291..4c2d3520a2023c 100644 --- a/be/src/exprs/expr.cpp +++ b/be/src/exprs/expr.cpp @@ -815,15 +815,16 @@ void Expr::assign_fn_ctx_idx(int* next_fn_ctx_idx) { _fn_ctx_idx = *next_fn_ctx_idx; ++(*next_fn_ctx_idx); } - for (Expr* child : children()) child->assign_fn_ctx_idx(next_fn_ctx_idx); + for (Expr* child : children()) { + child->assign_fn_ctx_idx(next_fn_ctx_idx); + } _fn_ctx_idx_end = *next_fn_ctx_idx; } Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, - 
ObjectPool* pool, Expr** scalar_expr, - const std::shared_ptr& tracker) { + ObjectPool* pool, Expr** scalar_expr) { *scalar_expr = nullptr; - Expr* root; + Expr* root = nullptr; RETURN_IF_ERROR(create_expr(pool, texpr.nodes[0], &root)); RETURN_IF_ERROR(create_tree(texpr, pool, root)); // TODO pengyubing replace by Init() @@ -844,12 +845,11 @@ Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeSt } Status Expr::create(const std::vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, ObjectPool* pool, std::vector* exprs, - const std::shared_ptr& tracker) { + RuntimeState* state, ObjectPool* pool, std::vector* exprs) { exprs->clear(); for (const TExpr& texpr : texprs) { - Expr* expr; - RETURN_IF_ERROR(create(texpr, row_desc, state, pool, &expr, tracker)); + Expr* expr = nullptr; + RETURN_IF_ERROR(create(texpr, row_desc, state, pool, &expr)); DCHECK(expr != nullptr); exprs->push_back(expr); } @@ -857,14 +857,13 @@ Status Expr::create(const std::vector& texprs, const RowDescriptor& row_d } Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, - Expr** scalar_expr, const std::shared_ptr& tracker) { - return Expr::create(texpr, row_desc, state, state->obj_pool(), scalar_expr, tracker); + Expr** scalar_expr) { + return Expr::create(texpr, row_desc, state, state->obj_pool(), scalar_expr); } Status Expr::create(const std::vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, std::vector* exprs, - const std::shared_ptr& tracker) { - return Expr::create(texprs, row_desc, state, state->obj_pool(), exprs, tracker); + RuntimeState* state, std::vector* exprs) { + return Expr::create(texprs, row_desc, state, state->obj_pool(), exprs); } Status Expr::create_tree(const TExpr& texpr, ObjectPool* pool, Expr* root) { diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h index 795dca6bc0884c..0004e9554fc276 100644 --- a/be/src/exprs/expr.h +++ b/be/src/exprs/expr.h @@ -179,23 +179,21 @@ class Expr { /// tuple row descriptor of the input tuple row. On failure, 'expr' is set to nullptr and /// the expr tree (if created) will be closed. Error status will be returned too. static Status create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, - ObjectPool* pool, Expr** expr, const std::shared_ptr& tracker); + ObjectPool* pool, Expr** expr); /// Create a new ScalarExpr based on thrift Expr 'texpr'. The newly created ScalarExpr /// is stored in ObjectPool 'state->obj_pool()' and returned in 'expr'. 'row_desc' is /// the tuple row descriptor of the input tuple row. Returns error status on failure. static Status create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, - Expr** expr, const std::shared_ptr& tracker); + Expr** expr); /// Convenience functions creating multiple ScalarExpr. static Status create(const std::vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, ObjectPool* pool, std::vector* exprs, - const std::shared_ptr& tracker); + RuntimeState* state, ObjectPool* pool, std::vector* exprs); /// Convenience functions creating multiple ScalarExpr. static Status create(const std::vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, std::vector* exprs, - const std::shared_ptr& tracker); + RuntimeState* state, std::vector* exprs); /// Convenience function for preparing multiple expr trees. /// Allocations from 'ctxs' will be counted against 'tracker'. 
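A pattern worth spelling out at this point: rather than threading a std::shared_ptr<MemTracker> through every constructor and Expr::create() overload, the patch moves the tracker into the thread itself, and the SCOPED_ATTACH_TASK_THREAD_*ARG / SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG macros install it for the duration of a scope. The sketch below is a minimal model of that mechanism, not the actual Doris implementation; SimpleTracker, ScopedSwitchTracker, and AttachTaskThread are illustrative stand-ins.

    #include <atomic>
    #include <cstdint>
    #include <memory>
    #include <string>
    #include <utility>

    // Illustrative stand-in for doris::MemTracker: a labeled byte counter.
    class SimpleTracker {
    public:
        explicit SimpleTracker(std::string label) : _label(std::move(label)) {}
        void consume(int64_t bytes) { _consumption.fetch_add(bytes); }
        void release(int64_t bytes) { _consumption.fetch_sub(bytes); }
        int64_t consumption() const { return _consumption.load(); }
        const std::string& label() const { return _label; }

    private:
        std::string _label;
        std::atomic<int64_t> _consumption{0};
    };

    // One tracker per thread; an allocation hook (or explicit call sites)
    // charges whatever tracker is currently installed here.
    thread_local std::shared_ptr<SimpleTracker> tls_tracker;

    // Analogue of SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG: install a
    // tracker for the current scope, restore the previous one on scope exit.
    class ScopedSwitchTracker {
    public:
        explicit ScopedSwitchTracker(std::shared_ptr<SimpleTracker> t)
                : _saved(std::move(tls_tracker)) {
            tls_tracker = std::move(t);
        }
        ~ScopedSwitchTracker() { tls_tracker = std::move(_saved); }

    private:
        std::shared_ptr<SimpleTracker> _saved;
    };

    // Rough analogue of SCOPED_ATTACH_TASK_THREAD_*ARG: bind a task's
    // identity and tracker to a worker thread for one unit of work.
    class AttachTaskThread {
    public:
        AttachTaskThread(std::string task_id, std::shared_ptr<SimpleTracker> t)
                : _task_id(std::move(task_id)), _guard(std::move(t)) {}

    private:
        std::string _task_id;  // a real version would publish this to the thread context
        ScopedSwitchTracker _guard;
    };

With guards like these, an exec node's prepare()/open()/get_next() can begin with a single scoped switch, and every allocation inside the call is charged to the node's tracker; that is why the tracker parameters disappear from the signatures above.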
diff --git a/be/src/exprs/expr_context.cpp b/be/src/exprs/expr_context.cpp index 40e93ee66a14fc..d97a09a433216e 100644 --- a/be/src/exprs/expr_context.cpp +++ b/be/src/exprs/expr_context.cpp @@ -28,6 +28,7 @@ #include "runtime/mem_tracker.h" #include "runtime/raw_value.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "udf/udf_internal.h" #include "util/debug_util.h" #include "util/stack_util.h" @@ -49,15 +50,17 @@ ExprContext::~ExprContext() { } } -// TODO(zc): memory tracker Status ExprContext::prepare(RuntimeState* state, const RowDescriptor& row_desc, const std::shared_ptr& tracker) { DCHECK(tracker != nullptr) << std::endl << get_stack_trace(); + if (_prepared) { + return Status::OK(); + } + _mem_tracker = tracker; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK(_pool.get() == nullptr); _prepared = true; - // TODO: use param tracker to replace instance_mem_tracker, be careful about tracker's life cycle - // _pool.reset(new MemPool(new MemTracker(-1))); - _pool.reset(new MemPool(state->instance_mem_tracker().get())); + _pool.reset(new MemPool()); return _root->prepare(state, row_desc, this); } @@ -66,6 +69,7 @@ Status ExprContext::open(RuntimeState* state) { if (_opened) { return Status::OK(); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); _opened = true; // Fragment-local state is only initialized for original contexts. Clones inherit the // original's fragment state and only need to have thread-local state initialized. @@ -84,6 +88,7 @@ Status ExprContext::open(std::vector evals, RuntimeState* state) { void ExprContext::close(RuntimeState* state) { DCHECK(!_closed); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); FunctionContext::FunctionStateScope scope = _is_clone ? 
FunctionContext::THREAD_LOCAL : FunctionContext::FRAGMENT_LOCAL; _root->close(state, this, scope); @@ -112,9 +117,10 @@ Status ExprContext::clone(RuntimeState* state, ExprContext** new_ctx) { DCHECK(_prepared); DCHECK(_opened); DCHECK(*new_ctx == nullptr); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); *new_ctx = state->obj_pool()->add(new ExprContext(_root)); - (*new_ctx)->_pool.reset(new MemPool(_pool->mem_tracker())); + (*new_ctx)->_pool.reset(new MemPool()); for (int i = 0; i < _fn_contexts.size(); ++i) { (*new_ctx)->_fn_contexts.push_back(_fn_contexts[i]->impl()->clone((*new_ctx)->_pool.get())); } @@ -123,6 +129,7 @@ Status ExprContext::clone(RuntimeState* state, ExprContext** new_ctx) { (*new_ctx)->_is_clone = true; (*new_ctx)->_prepared = true; (*new_ctx)->_opened = true; + (*new_ctx)->_mem_tracker = _mem_tracker; return _root->open(state, *new_ctx, FunctionContext::THREAD_LOCAL); } @@ -132,8 +139,9 @@ Status ExprContext::clone(RuntimeState* state, ExprContext** new_ctx, Expr* root DCHECK(_opened); DCHECK(*new_ctx == nullptr); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); *new_ctx = state->obj_pool()->add(new ExprContext(root)); - (*new_ctx)->_pool.reset(new MemPool(_pool->mem_tracker())); + (*new_ctx)->_pool.reset(new MemPool()); for (int i = 0; i < _fn_contexts.size(); ++i) { (*new_ctx)->_fn_contexts.push_back(_fn_contexts[i]->impl()->clone((*new_ctx)->_pool.get())); } @@ -142,11 +150,13 @@ Status ExprContext::clone(RuntimeState* state, ExprContext** new_ctx, Expr* root (*new_ctx)->_is_clone = true; (*new_ctx)->_prepared = true; (*new_ctx)->_opened = true; + (*new_ctx)->_mem_tracker = _mem_tracker; return root->open(state, *new_ctx, FunctionContext::THREAD_LOCAL); } void ExprContext::free_local_allocations() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); free_local_allocations(_fn_contexts); } @@ -371,10 +381,11 @@ Status ExprContext::get_const_value(RuntimeState* state, Expr& expr, AnyVal** co StringVal* sv = reinterpret_cast(*const_val); if (!sv->is_null && sv->len > 0) { // Make sure the memory is owned by this evaluator. - char* ptr_copy = reinterpret_cast(_pool->try_allocate(sv->len)); + Status rst; + char* ptr_copy = reinterpret_cast(_pool->try_allocate(sv->len, &rst)); if (ptr_copy == nullptr) { - return _pool->mem_tracker()->MemLimitExceeded( - state, "Could not allocate constant string value", sv->len); + RETURN_ALLOC_LIMIT_EXCEEDED(_pool->mem_tracker(), state, + "Could not allocate constant string value", sv->len, rst); } memcpy(ptr_copy, sv->ptr, sv->len); sv->ptr = reinterpret_cast(ptr_copy); diff --git a/be/src/exprs/expr_context.h b/be/src/exprs/expr_context.h index f176240f720f2b..9de41f169c7688 100644 --- a/be/src/exprs/expr_context.h +++ b/be/src/exprs/expr_context.h @@ -170,6 +170,8 @@ class ExprContext { /// TODO: revisit this FunctionContext** _fn_contexts_ptr; + std::shared_ptr _mem_tracker; + /// Pool backing fn_contexts_. Counts against the runtime state's UDF mem tracker. 
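AggFnEvaluator earlier in this patch keeps its batching trick: byte deltas accumulate in a plain counter and are pushed to the MemTracker only once they exceed 16777216 bytes (16 MiB), and everything reported is released in close(). Below is a sketch of that accumulate-then-flush pattern, reusing SimpleTracker from the sketch above; the class name and default threshold are illustrative.

    // Accumulate small deltas locally; hit the (possibly contended) tracker
    // only when the pending amount crosses a threshold. Cheaper, but the
    // tracker can lag reality by up to 'threshold' bytes: that lag is the
    // accuracy trade-off the config comments in this patch warn about.
    class BatchedConsumer {
    public:
        explicit BatchedConsumer(SimpleTracker* tracker,
                                 int64_t threshold = 16 * 1024 * 1024)
                : _tracker(tracker), _threshold(threshold) {}
        ~BatchedConsumer() { close(); }

        void consume(int64_t bytes) {
            _pending += bytes;
            if (_pending > _threshold) {
                _tracker->consume(_pending);
                _total += _pending;  // remember what was actually reported
                _pending = 0;
            }
        }

        // Mirrors AggFnEvaluator::close(): hand back everything reported.
        void close() {
            if (_total > 0) {
                _tracker->release(_total);
                _total = 0;
            }
            _pending = 0;
        }

    private:
        SimpleTracker* _tracker;
        int64_t _threshold;
        int64_t _pending = 0;  // accumulated but not yet reported
        int64_t _total = 0;    // reported to the tracker, released on close()
    };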
std::unique_ptr _pool; diff --git a/be/src/exprs/new_agg_fn_evaluator.cc b/be/src/exprs/new_agg_fn_evaluator.cc index 7a2209ba7fedab..17f8a931f31037 100644 --- a/be/src/exprs/new_agg_fn_evaluator.cc +++ b/be/src/exprs/new_agg_fn_evaluator.cc @@ -90,19 +90,13 @@ typedef AnyVal (*FinalizeFn)(FunctionContext*, const AnyVal&); const int DEFAULT_MULTI_DISTINCT_COUNT_STRING_BUFFER_SIZE = 1024; -NewAggFnEvaluator::NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, - const std::shared_ptr& tracker, bool is_clone) - : _total_mem_consumption(0), - _accumulated_mem_consumption(0), +NewAggFnEvaluator::NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, bool is_clone) + : _accumulated_mem_consumption(0), is_clone_(is_clone), agg_fn_(agg_fn), - mem_pool_(mem_pool), - _mem_tracker(tracker) {} + mem_pool_(mem_pool) {} NewAggFnEvaluator::~NewAggFnEvaluator() { - if (UNLIKELY(_total_mem_consumption > 0)) { - _mem_tracker->Release(_total_mem_consumption); - } DCHECK(closed_); } @@ -122,7 +116,7 @@ Status NewAggFnEvaluator::Create(const AggFn& agg_fn, RuntimeState* state, Objec // Create a new AggFn evaluator. NewAggFnEvaluator* agg_fn_eval = - pool->add(new NewAggFnEvaluator(agg_fn, mem_pool, tracker, false)); + pool->add(new NewAggFnEvaluator(agg_fn, mem_pool, false)); agg_fn_eval->agg_fn_ctx_.reset(FunctionContextImpl::create_context( state, mem_pool, agg_fn.GetIntermediateTypeDesc(), agg_fn.GetOutputTypeDesc(), @@ -633,7 +627,7 @@ void NewAggFnEvaluator::SerializeOrFinalize(Tuple* src, const SlotDescriptor& ds void NewAggFnEvaluator::ShallowClone(ObjectPool* pool, MemPool* mem_pool, NewAggFnEvaluator** cloned_eval) const { DCHECK(opened_); - *cloned_eval = pool->add(new NewAggFnEvaluator(agg_fn_, mem_pool, _mem_tracker, true)); + *cloned_eval = pool->add(new NewAggFnEvaluator(agg_fn_, mem_pool, true)); (*cloned_eval)->agg_fn_ctx_.reset(agg_fn_ctx_->impl()->clone(mem_pool)); DCHECK_EQ((*cloned_eval)->input_evals_.size(), 0); (*cloned_eval)->input_evals_ = input_evals_; diff --git a/be/src/exprs/new_agg_fn_evaluator.h b/be/src/exprs/new_agg_fn_evaluator.h index 36bdc2f21c4dbc..462c4705a174c5 100644 --- a/be/src/exprs/new_agg_fn_evaluator.h +++ b/be/src/exprs/new_agg_fn_evaluator.h @@ -188,7 +188,6 @@ class NewAggFnEvaluator { static std::string DebugString(const std::vector& evals); private: - uint64_t _total_mem_consumption; uint64_t _accumulated_mem_consumption; // index if has multi count distinct @@ -209,8 +208,6 @@ class NewAggFnEvaluator { /// Owned by the exec node which owns this evaluator. MemPool* mem_pool_ = nullptr; - std::shared_ptr _mem_tracker; // saved c'tor param - /// This contains runtime state such as constant input arguments to the aggregate /// functions and a FreePool from which the intermediate values are allocated. /// Owned by this evaluator. @@ -231,8 +228,7 @@ class NewAggFnEvaluator { doris_udf::AnyVal* staging_merge_input_val_ = nullptr; /// Use Create() instead. - NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, - const std::shared_ptr& tracker, bool is_clone); + NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, bool is_clone); /// Return the intermediate type of the aggregate function. 
inline const SlotDescriptor& intermediate_slot_desc() const; diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 4b603ea4ed1369..6e134578673371 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -318,18 +318,17 @@ BinaryPredicate* create_bin_predicate(ObjectPool* pool, PrimitiveType prim_type, // This class is a wrapper of runtime predicate function class RuntimePredicateWrapper { public: - RuntimePredicateWrapper(RuntimeState* state, MemTracker* tracker, ObjectPool* pool, + RuntimePredicateWrapper(RuntimeState* state, ObjectPool* pool, const RuntimeFilterParams* params) - : _tracker(tracker), - _pool(pool), + : _pool(pool), _column_return_type(params->column_return_type), _filter_type(params->filter_type), _fragment_instance_id(params->fragment_instance_id), _filter_id(params->filter_id) {} // for a 'tmp' runtime predicate wrapper // only could called assign method or as a param for merge - RuntimePredicateWrapper(MemTracker* tracker, ObjectPool* pool, RuntimeFilterType type, UniqueId fragment_instance_id, uint32_t filter_id) - : _tracker(tracker), _pool(pool), _filter_type(type), _fragment_instance_id(fragment_instance_id), _filter_id(filter_id) {} + RuntimePredicateWrapper(ObjectPool* pool, RuntimeFilterType type, UniqueId fragment_instance_id, uint32_t filter_id) + : _pool(pool), _filter_type(type), _fragment_instance_id(fragment_instance_id), _filter_id(filter_id) {} // init runtime filter wrapper // alloc memory to init runtime filter function Status init(const RuntimeFilterParams* params) { @@ -345,12 +344,12 @@ class RuntimePredicateWrapper { } case RuntimeFilterType::BLOOM_FILTER: { _is_bloomfilter = true; - _bloomfilter_func.reset(create_bloom_filter(_tracker, _column_return_type)); + _bloomfilter_func.reset(create_bloom_filter(_column_return_type)); return _bloomfilter_func->init_with_fixed_length(params->bloom_filter_size); } case RuntimeFilterType::IN_OR_BLOOM_FILTER: { _hybrid_set.reset(create_set(_column_return_type)); - _bloomfilter_func.reset(create_bloom_filter(_tracker, _column_return_type)); + _bloomfilter_func.reset(create_bloom_filter(_column_return_type)); return _bloomfilter_func->init_with_fixed_length(params->bloom_filter_size); } default: @@ -622,8 +621,6 @@ class RuntimePredicateWrapper { } Status assign(const PInFilter* in_filter) { - DCHECK(_tracker != nullptr); - PrimitiveType type = to_primitive_type(in_filter->column_type()); if (in_filter->has_ignored_msg()) { VLOG_DEBUG << "Ignore in filter(id=" << _filter_id << ") because: " << in_filter->ignored_msg(); @@ -726,18 +723,16 @@ class RuntimePredicateWrapper { // used by shuffle runtime filter // assign this filter by protobuf Status assign(const PBloomFilter* bloom_filter, const char* data) { - DCHECK(_tracker != nullptr); _is_bloomfilter = true; // we won't use this class to insert or find any data // so any type is ok - _bloomfilter_func.reset(create_bloom_filter(_tracker, PrimitiveType::TYPE_INT)); + _bloomfilter_func.reset(create_bloom_filter(PrimitiveType::TYPE_INT)); return _bloomfilter_func->assign(data, bloom_filter->filter_length()); } // used by shuffle runtime filter // assign this filter by protobuf Status assign(const PMinMaxFilter* minmax_filter) { - DCHECK(_tracker != nullptr); PrimitiveType type = to_primitive_type(minmax_filter->column_type()); _minmax_func.reset(create_minmax_filter(type)); switch (type) { @@ -890,7 +885,6 @@ class RuntimePredicateWrapper { } private: - MemTracker* _tracker; ObjectPool* _pool; 
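Note how the wrapper below simply drops its _tracker member: BloomFilterFuncBase now creates its own virtual tracker in its constructor, and light_copy() clears the copy's tracker so the shared filter memory is released exactly once (the "Avoid repeated release" comment in the bloomfilter_predicate.h hunk above). A compact model of that ownership rule, reusing SimpleTracker from earlier; all names here are illustrative.

    #include <cstdint>
    #include <memory>
    #include <vector>

    class TrackedBloomFilter {
    public:
        TrackedBloomFilter()
                : _tracker(std::make_shared<SimpleTracker>("BloomFilterFunc")) {}

        ~TrackedBloomFilter() {
            // Light copies null out _tracker, so only the owning instance
            // gives the bytes back; this avoids the double release.
            if (_tracker != nullptr) {
                _tracker->release(_alloced);
            }
        }

        void init(int64_t filter_bytes) {
            _filter = std::make_shared<std::vector<uint8_t>>(filter_bytes);
            _alloced = filter_bytes;
            _tracker->consume(_alloced);
        }

        // Share the underlying filter without taking over its accounting.
        void light_copy(const TrackedBloomFilter& other) {
            _tracker = nullptr;       // avoid repeated release in the destructor
            _alloced = other._alloced;
            _filter = other._filter;  // shared, still charged to 'other'
        }

    private:
        std::shared_ptr<SimpleTracker> _tracker;
        std::shared_ptr<std::vector<uint8_t>> _filter;
        int64_t _alloced = 0;
    };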
PrimitiveType _column_return_type; // column type RuntimeFilterType _filter_type; @@ -905,10 +899,10 @@ class RuntimePredicateWrapper { uint32_t _filter_id; }; -Status IRuntimeFilter::create(RuntimeState* state, MemTracker* tracker, ObjectPool* pool, +Status IRuntimeFilter::create(RuntimeState* state, ObjectPool* pool, const TRuntimeFilterDesc* desc, const TQueryOptions* query_options, const RuntimeFilterRole role, int node_id, IRuntimeFilter** res) { - *res = pool->add(new IRuntimeFilter(state, tracker, pool)); + *res = pool->add(new IRuntimeFilter(state, pool)); (*res)->set_role(role); UniqueId fragment_instance_id(state->fragment_instance_id()); return (*res)->init_with_desc(desc, query_options, fragment_instance_id, node_id); @@ -1048,7 +1042,7 @@ Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQue RETURN_IF_ERROR(Expr::create_expr_tree(_pool, iter->second, &_probe_ctx)); } - _wrapper = _pool->add(new RuntimePredicateWrapper(_state, _mem_tracker, _pool, &params)); + _wrapper = _pool->add(new RuntimePredicateWrapper(_state, _pool, &params)); return _wrapper->init(&params); } @@ -1060,16 +1054,14 @@ Status IRuntimeFilter::serialize(PPublishFilterRequest* request, void** data, in return serialize_impl(request, data, len); } -Status IRuntimeFilter::create_wrapper(const MergeRuntimeFilterParams* param, MemTracker* tracker, - ObjectPool* pool, +Status IRuntimeFilter::create_wrapper(const MergeRuntimeFilterParams* param, ObjectPool* pool, std::unique_ptr<RuntimePredicateWrapper>* wrapper) { - return _create_wrapper(param, tracker, pool, wrapper); + return _create_wrapper(param, pool, wrapper); } -Status IRuntimeFilter::create_wrapper(const UpdateRuntimeFilterParams* param, MemTracker* tracker, - ObjectPool* pool, +Status IRuntimeFilter::create_wrapper(const UpdateRuntimeFilterParams* param, ObjectPool* pool, std::unique_ptr<RuntimePredicateWrapper>* wrapper) { - return _create_wrapper(param, tracker, pool, wrapper); + return _create_wrapper(param, pool, wrapper); } void IRuntimeFilter::change_to_bloom_filter() { @@ -1081,10 +1073,10 @@ void IRuntimeFilter::change_to_bloom_filter() { } template <class T> -Status IRuntimeFilter::_create_wrapper(const T* param, MemTracker* tracker, ObjectPool* pool, +Status IRuntimeFilter::_create_wrapper(const T* param, ObjectPool* pool, std::unique_ptr<RuntimePredicateWrapper>* wrapper) { int filter_type = param->request->filter_type(); - wrapper->reset(new RuntimePredicateWrapper(tracker, pool, get_type(filter_type), + wrapper->reset(new RuntimePredicateWrapper(pool, get_type(filter_type), UniqueId(param->request->fragment_id()), param->request->filter_id())); switch (filter_type) { @@ -1383,7 +1375,7 @@ Status IRuntimeFilter::update_filter(const UpdateRuntimeFilterParams* param) { set_ignored_msg(*msg); } std::unique_ptr<RuntimePredicateWrapper> wrapper; - RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(param, _mem_tracker, _pool, &wrapper)); + RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(param, _pool, &wrapper)); auto origin_type = _wrapper->get_real_type(); RETURN_IF_ERROR(_wrapper->merge(wrapper.get())); if (origin_type != _wrapper->get_real_type()) { diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index 663843d2b68041..8d5b433b9b38ef 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -106,9 +106,8 @@ struct MergeRuntimeFilterParams { /// that can be pushed down to node based on the results of the right table.
class IRuntimeFilter { public: - IRuntimeFilter(RuntimeState* state, MemTracker* mem_tracker, ObjectPool* pool) + IRuntimeFilter(RuntimeState* state, ObjectPool* pool) : _state(state), - _mem_tracker(mem_tracker), _pool(pool), _runtime_filter_type(RuntimeFilterType::UNKNOWN_FILTER), _filter_id(-1), @@ -124,7 +123,7 @@ class IRuntimeFilter { ~IRuntimeFilter() = default; - static Status create(RuntimeState* state, MemTracker* tracker, ObjectPool* pool, + static Status create(RuntimeState* state, ObjectPool* pool, const TRuntimeFilterDesc* desc, const TQueryOptions* query_options, const RuntimeFilterRole role, int node_id, IRuntimeFilter** res); @@ -191,11 +190,9 @@ class IRuntimeFilter { // for ut const RuntimePredicateWrapper* get_wrapper(); - static Status create_wrapper(const MergeRuntimeFilterParams* param, MemTracker* tracker, - ObjectPool* pool, + static Status create_wrapper(const MergeRuntimeFilterParams* param, ObjectPool* pool, std::unique_ptr* wrapper); - static Status create_wrapper(const UpdateRuntimeFilterParams* param, MemTracker* tracker, - ObjectPool* pool, + static Status create_wrapper(const UpdateRuntimeFilterParams* param, ObjectPool* pool, std::unique_ptr* wrapper); void change_to_bloom_filter(); Status update_filter(const UpdateRuntimeFilterParams* param); @@ -234,11 +231,10 @@ class IRuntimeFilter { Status serialize_impl(T* request, void** data, int* len); template - static Status _create_wrapper(const T* param, MemTracker* tracker, ObjectPool* pool, + static Status _create_wrapper(const T* param, ObjectPool* pool, std::unique_ptr* wrapper); RuntimeState* _state; - MemTracker* _mem_tracker; ObjectPool* _pool; // _wrapper is a runtime filter function wrapper // _wrapper should alloc from _pool diff --git a/be/src/gutil/strings/numbers.cc b/be/src/gutil/strings/numbers.cc index 5027dea46b89d4..6cc76d24850ffa 100644 --- a/be/src/gutil/strings/numbers.cc +++ b/be/src/gutil/strings/numbers.cc @@ -1479,6 +1479,41 @@ string ItoaKMGT(int64 i) { return StringPrintf("%s%" PRId64 "%s", sign, val, suffix); } +string AccurateItoaKMGT(int64 i) { + const char *sign = ""; + if (i < 0) { + // We lose some accuracy if the caller passes LONG_LONG_MIN, but + // that's OK as this function is only for human readability + if (i == numeric_limits::min()) i++; + sign = "-"; + i = -i; + } + + string ret = StringPrintf("%s", sign); + int64 val; + if ((val = (i >> 40)) > 1) { + ret += StringPrintf(" %" PRId64 "%s", val, "T"); + i = i - (val << 40); + } + if ((val = (i >> 30)) > 1) { + ret += StringPrintf(" %" PRId64 "%s", val, "G"); + i = i - (val << 30); + } + if ((val = (i >> 20)) > 1) { + ret += StringPrintf(" %" PRId64 "%s", val, "M"); + i = i - (val << 20); + } + if ((val = (i >> 10)) > 1) { + ret += StringPrintf(" %" PRId64 "%s", val, "K"); + i = i - (val << 10); + } else { + ret += StringPrintf(" %" PRId64 "%s", i, "K"); + } + + return ret; +} + + // DEPRECATED(wadetregaskis). // These are non-inline because some BUILD files turn on -Wformat-non-literal. diff --git a/be/src/gutil/strings/numbers.h b/be/src/gutil/strings/numbers.h index 00a10d37a81ee5..01540d29008683 100644 --- a/be/src/gutil/strings/numbers.h +++ b/be/src/gutil/strings/numbers.h @@ -474,8 +474,12 @@ char* SimpleItoaWithCommas(__int128_t i, char* buffer, int32_t buffer_size); // e.g. 
3000 -> 2K 57185920 -> 54M // // Return value: string +// +// AccurateItoaKMGT() +// Description: like ItoaKMGT(), but emits each unit together with its +// remainder to preserve accuracy, e.g. 57185920 -> " 54M 549K" // ---------------------------------------------------------------------- string ItoaKMGT(int64 i); +string AccurateItoaKMGT(int64 i); // ---------------------------------------------------------------------- // ParseDoubleRange() diff --git a/be/src/http/action/compaction_action.cpp b/be/src/http/action/compaction_action.cpp index 6c52c9165d8b81..b228a2b58d968d 100644 --- a/be/src/http/action/compaction_action.cpp +++ b/be/src/http/action/compaction_action.cpp @@ -30,6 +30,7 @@ #include "http/http_response.h" #include "http/http_status.h" #include "olap/base_compaction.h" +#include "runtime/thread_context.h" #include "olap/cumulative_compaction.h" #include "olap/olap_define.h" #include "olap/storage_engine.h" @@ -225,8 +226,7 @@ OLAPStatus CompactionAction::_execute_compaction_callback(TabletSharedPtr tablet OLAPStatus status = OLAP_SUCCESS; if (compaction_type == PARAM_COMPACTION_BASE) { - std::string tracker_label = "CompactionAction:BaseCompaction:" + std::to_string(syscall(__NR_gettid)); - BaseCompaction base_compaction(tablet, tracker_label, _compaction_mem_tracker); + BaseCompaction base_compaction(tablet); OLAPStatus res = base_compaction.compact(); if (res != OLAP_SUCCESS && res != OLAP_ERR_BE_NO_SUITABLE_VERSION) { DorisMetrics::instance()->base_compaction_request_failed->increment(1); @@ -235,8 +235,7 @@ OLAPStatus CompactionAction::_execute_compaction_callback(TabletSharedPtr tablet } status = res; } else if (compaction_type == PARAM_COMPACTION_CUMULATIVE) { - std::string tracker_label = "CompactionAction:CumulativeCompaction:" + std::to_string(syscall(__NR_gettid)); - CumulativeCompaction cumulative_compaction(tablet, tracker_label, _compaction_mem_tracker); + CumulativeCompaction cumulative_compaction(tablet); OLAPStatus res = cumulative_compaction.compact(); if (res != OLAP_SUCCESS && res != OLAP_ERR_CUMULATIVE_NO_SUITABLE_VERSIONS) { DorisMetrics::instance()->cumulative_compaction_request_failed->increment(1); @@ -254,6 +253,7 @@ OLAPStatus CompactionAction::_execute_compaction_callback(TabletSharedPtr tablet } void CompactionAction::handle(HttpRequest* req) { + SCOPED_ATTACH_TASK_THREAD_2ARG(ThreadContext::TaskType::COMPACTION, _compaction_mem_tracker); req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str()); if (_type == CompactionActionType::SHOW_INFO) { diff --git a/be/src/http/action/compaction_action.h b/be/src/http/action/compaction_action.h index a989c9b8293f6b..8138279aeeb298 100644 --- a/be/src/http/action/compaction_action.h +++ b/be/src/http/action/compaction_action.h @@ -39,10 +39,11 @@ const std::string PARAM_COMPACTION_CUMULATIVE = "cumulative"; /// See compaction-action.md for details. class CompactionAction : public HttpHandler { public: - CompactionAction(CompactionActionType type) - : _type(type) { - _compaction_mem_tracker = type == RUN_COMPACTION ? - MemTracker::CreateTracker(-1, "ManualCompaction", nullptr, false, false, MemTrackerLevel::TASK) : nullptr; + CompactionAction(CompactionActionType type) : _type(type) { + _compaction_mem_tracker = + type == RUN_COMPACTION ?
MemTracker::create_tracker(-1, "ManualCompaction", nullptr, + MemTrackerLevel::TASK) + : nullptr; } virtual ~CompactionAction() {} diff --git a/be/src/http/default_path_handlers.cpp b/be/src/http/default_path_handlers.cpp index d8416970dc00fe..2b7803344b6a9d 100644 --- a/be/src/http/default_path_handlers.cpp +++ b/be/src/http/default_path_handlers.cpp @@ -144,12 +144,22 @@ void mem_tracker_handler(const WebPageHandler::ArgumentMap& args, std::stringstr (*output) << "\n"; std::vector> trackers; - MemTracker::ListTrackers(&trackers); + MemTracker::list_process_trackers(&trackers); for (const shared_ptr& tracker : trackers) { string parent = tracker->parent() == nullptr ? "none" : tracker->parent()->label(); - string limit_str = tracker->limit() == -1 ? "none" : ItoaKMGT(tracker->limit()); - string current_consumption_str = ItoaKMGT(tracker->consumption()); - string peak_consumption_str = ItoaKMGT(tracker->peak_consumption()); + string limit_str; + string current_consumption_str; + string peak_consumption_str; + if (!config::memory_leak_detection) { + limit_str = tracker->limit() == -1 ? "none" : ItoaKMGT(tracker->limit()); + current_consumption_str = ItoaKMGT(tracker->consumption()); + peak_consumption_str = ItoaKMGT(tracker->peak_consumption()); + } else { + limit_str = tracker->limit() == -1 ? "none" : AccurateItoaKMGT(tracker->limit()); + current_consumption_str = AccurateItoaKMGT(tracker->consumption()); + peak_consumption_str = AccurateItoaKMGT(tracker->peak_consumption()); + } + int64_t use_count = tracker.use_count(); (*output) << strings::Substitute( "$0$1$2" // id, parent, limit diff --git a/be/src/olap/aggregate_func.h b/be/src/olap/aggregate_func.h index f1996330f1c4eb..1a84806952c995 100644 --- a/be/src/olap/aggregate_func.h +++ b/be/src/olap/aggregate_func.h @@ -24,7 +24,6 @@ #include "runtime/datetime_value.h" #include "runtime/decimalv2_value.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "runtime/string_value.h" #include "util/bitmap_value.h" @@ -488,8 +487,6 @@ struct AggregateFuncTraitsdata = reinterpret_cast(hll); - mem_pool->mem_tracker()->Consume(hll->memory_consumed()); - agg_pool->add(hll); } @@ -534,7 +531,6 @@ struct AggregateFuncTraitssize = 0; auto bitmap = new BitmapValue(src_slice->data); - mem_pool->mem_tracker()->Consume(sizeof(BitmapValue)); dst_slice->data = (char*)bitmap; agg_pool->add(bitmap); diff --git a/be/src/olap/base_compaction.cpp b/be/src/olap/base_compaction.cpp index cc7c358d4739ac..7647527d1b1357 100644 --- a/be/src/olap/base_compaction.cpp +++ b/be/src/olap/base_compaction.cpp @@ -19,12 +19,12 @@ #include "util/doris_metrics.h" #include "util/trace.h" +#include "runtime/thread_context.h" namespace doris { -BaseCompaction::BaseCompaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr& parent_tracker) - : Compaction(tablet, label, parent_tracker) {} +BaseCompaction::BaseCompaction(TabletSharedPtr tablet) + : Compaction(tablet, "BaseCompaction:" + std::to_string(tablet->tablet_id())) {} BaseCompaction::~BaseCompaction() {} diff --git a/be/src/olap/base_compaction.h b/be/src/olap/base_compaction.h index 54088ea48d7dfe..d4c2c2f360af79 100644 --- a/be/src/olap/base_compaction.h +++ b/be/src/olap/base_compaction.h @@ -29,8 +29,7 @@ namespace doris { class BaseCompaction : public Compaction { public: - BaseCompaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr& parent_tracker); + BaseCompaction(TabletSharedPtr tablet); ~BaseCompaction() override; OLAPStatus 
prepare_compact() override; diff --git a/be/src/olap/bloom_filter_predicate.cpp b/be/src/olap/bloom_filter_predicate.cpp index 48127357181cff..834df7198994cf 100644 --- a/be/src/olap/bloom_filter_predicate.cpp +++ b/be/src/olap/bloom_filter_predicate.cpp @@ -41,19 +41,19 @@ ColumnPredicate* BloomFilterColumnPredicateFactory::create_column_predicate( switch (type) { #define M(NAME) \ case OLAP_FIELD_##NAME: { \ - filter.reset(create_bloom_filter(bloom_filter->tracker(), NAME)); \ + filter.reset(create_bloom_filter(NAME)); \ filter->light_copy(bloom_filter.get()); \ return new BloomFilterColumnPredicate(column_id, filter); \ } APPLY_FOR_PRIMTYPE(M) #undef M case OLAP_FIELD_TYPE_DECIMAL: { - filter.reset(create_bloom_filter(bloom_filter->tracker(), TYPE_DECIMALV2)); + filter.reset(create_bloom_filter(TYPE_DECIMALV2)); filter->light_copy(bloom_filter.get()); return new BloomFilterColumnPredicate(column_id, filter); } case OLAP_FIELD_TYPE_BOOL: { - filter.reset(create_bloom_filter(bloom_filter->tracker(), TYPE_BOOLEAN)); + filter.reset(create_bloom_filter(TYPE_BOOLEAN)); filter->light_copy(bloom_filter.get()); return new BloomFilterColumnPredicate(column_id, filter); } diff --git a/be/src/olap/collect_iterator.h b/be/src/olap/collect_iterator.h index e0dd44d2e3601f..2161ccf437806e 100644 --- a/be/src/olap/collect_iterator.h +++ b/be/src/olap/collect_iterator.h @@ -17,6 +17,8 @@ #pragma once +#include + #include "olap/olap_define.h" #include "olap/row_cursor.h" #include "olap/rowset/rowset_reader.h" diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 8576292a807c81..7b3239eed85e3f 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -19,6 +19,7 @@ #include "gutil/strings/substitute.h" #include "olap/rowset/rowset_factory.h" +#include "runtime/thread_context.h" #include "util/time.h" #include "util/trace.h" @@ -26,13 +27,15 @@ using std::vector; namespace doris { -Compaction::Compaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr& parent_tracker) - : _mem_tracker(MemTracker::CreateTracker(-1, label, parent_tracker, true, false, MemTrackerLevel::TASK)), - _readers_tracker(MemTracker::CreateTracker(-1, "CompactionReaderTracker:" + std::to_string(tablet->tablet_id()), _mem_tracker, - true, false)), - _writer_tracker(MemTracker::CreateTracker(-1, "CompationWriterTracker:" + std::to_string(tablet->tablet_id()), _mem_tracker, - true, false)), +Compaction::Compaction(TabletSharedPtr tablet, const std::string& label) + : _mem_tracker( + MemTracker::create_tracker(-1, label, nullptr, MemTrackerLevel::INSTANCE)), + _readers_tracker(MemTracker::create_tracker( + -1, "CompactionReaderTracker:" + std::to_string(tablet->tablet_id()), + _mem_tracker)), + _writer_tracker(MemTracker::create_tracker( + -1, "CompationWriterTracker:" + std::to_string(tablet->tablet_id()), + _mem_tracker)), _tablet(tablet), _input_rowsets_size(0), _input_row_num(0), @@ -41,6 +44,7 @@ Compaction::Compaction(TabletSharedPtr tablet, const std::string& label, Compaction::~Compaction() {} OLAPStatus Compaction::compact() { + SCOPED_ATTACH_TASK_THREAD_2ARG(ThreadContext::TaskType::COMPACTION, _mem_tracker); RETURN_NOT_OK(prepare_compact()); RETURN_NOT_OK(execute_compact()); return OLAP_SUCCESS; @@ -141,7 +145,8 @@ OLAPStatus Compaction::do_compaction_impl(int64_t permits) { << ", output_version=" << _output_version << ", current_max_version=" << current_max_version << ", disk=" << _tablet->data_dir()->path() << ", segments=" << segments_num - << ". 
elapsed time=" << watch.get_elapse_second() << "s. cumulative_compaction_policy=" + << ". elapsed time=" << watch.get_elapse_second() + << "s. cumulative_compaction_policy=" << _tablet->cumulative_compaction_policy()->name() << "."; return OLAP_SUCCESS; @@ -163,7 +168,6 @@ OLAPStatus Compaction::construct_output_rowset_writer() { context.rowset_state = VISIBLE; context.version = _output_version; context.segments_overlap = NONOVERLAPPING; - context.parent_mem_tracker = _writer_tracker; // The test results show that one rs writer is low-memory-footprint, there is no need to track its mem pool RETURN_NOT_OK(RowsetFactory::create_rowset_writer(context, &_output_rs_writer)); return OLAP_SUCCESS; @@ -172,11 +176,7 @@ OLAPStatus Compaction::construct_input_rowset_readers() { for (auto& rowset : _input_rowsets) { RowsetReaderSharedPtr rs_reader; - RETURN_NOT_OK(rowset->create_reader( - MemTracker::CreateTracker( - -1, "Compaction:RowsetReader:" + rowset->rowset_id().to_string(), - _readers_tracker, true, true), - &rs_reader)); + RETURN_NOT_OK(rowset->create_reader(&rs_reader)); _input_rs_readers.push_back(std::move(rs_reader)); } return OLAP_SUCCESS; @@ -295,4 +295,4 @@ int64_t Compaction::get_compaction_permits() { return permits; } -} // namespace doris +} // namespace doris diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h index 6e7985a4d1c7c6..71dab8b3ff955e 100644 --- a/be/src/olap/compaction.h +++ b/be/src/olap/compaction.h @@ -44,8 +44,7 @@ class Merger; // 4. gc output rowset if failed class Compaction { public: - Compaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr<MemTracker>& parent_tracker); + Compaction(TabletSharedPtr tablet, const std::string& label); virtual ~Compaction(); // This is only for http CompactionAction @@ -84,6 +83,7 @@ class Compaction { // the root tracker for this compaction std::shared_ptr<MemTracker> _mem_tracker; + // TODO(zxy) not used // the child of root, only track rowset readers mem std::shared_ptr<MemTracker> _readers_tracker; diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index f987acdbe342ce..bce0148a834246 100644 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -19,13 +19,13 @@ #include "util/doris_metrics.h" #include "util/time.h" +#include "runtime/thread_context.h" #include "util/trace.h" namespace doris { -CumulativeCompaction::CumulativeCompaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr<MemTracker>& parent_tracker) - : Compaction(tablet, label, parent_tracker) {} +CumulativeCompaction::CumulativeCompaction(TabletSharedPtr tablet) + : Compaction(tablet, "CumulativeCompaction:" + std::to_string(tablet->tablet_id())) {} CumulativeCompaction::~CumulativeCompaction() {} diff --git a/be/src/olap/cumulative_compaction.h b/be/src/olap/cumulative_compaction.h index c1d742de9f03d7..d7c26ed6699db6 100644 --- a/be/src/olap/cumulative_compaction.h +++ b/be/src/olap/cumulative_compaction.h @@ -27,8 +27,7 @@ namespace doris { class CumulativeCompaction : public Compaction { public: - CumulativeCompaction(TabletSharedPtr tablet, const std::string& label, - const std::shared_ptr<MemTracker>& parent_tracker); + CumulativeCompaction(TabletSharedPtr tablet); ~CumulativeCompaction() override; OLAPStatus prepare_compact() override; diff --git a/be/src/olap/delta_writer.cpp b/be/src/olap/delta_writer.cpp index c90d017c917331..87664c01d30e20 100644 --- a/be/src/olap/delta_writer.cpp +++ 
b/be/src/olap/delta_writer.cpp @@ -25,18 +25,17 @@ #include "olap/schema_change.h" #include "olap/storage_engine.h" #include "runtime/row_batch.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" namespace doris { -OLAPStatus DeltaWriter::open(WriteRequest* req, const std::shared_ptr& parent, - DeltaWriter** writer) { - *writer = new DeltaWriter(req, parent, StorageEngine::instance()); +OLAPStatus DeltaWriter::open(WriteRequest* req, DeltaWriter** writer) { + *writer = new DeltaWriter(req, StorageEngine::instance()); return OLAP_SUCCESS; } -DeltaWriter::DeltaWriter(WriteRequest* req, const std::shared_ptr& parent, - StorageEngine* storage_engine) +DeltaWriter::DeltaWriter(WriteRequest* req, StorageEngine* storage_engine) : _req(*req), _tablet(nullptr), _cur_rowset(nullptr), @@ -45,8 +44,7 @@ DeltaWriter::DeltaWriter(WriteRequest* req, const std::shared_ptr& p _rowset_writer(nullptr), _tablet_schema(nullptr), _delta_written_success(false), - _storage_engine(storage_engine), - _parent_mem_tracker(parent) {} + _storage_engine(storage_engine) {} DeltaWriter::~DeltaWriter() { if (_is_init && !_delta_written_success) { @@ -105,8 +103,9 @@ OLAPStatus DeltaWriter::init() { return OLAP_ERR_TABLE_NOT_FOUND; } - _mem_tracker = MemTracker::CreateTracker(-1, "DeltaWriter:" + std::to_string(_tablet->tablet_id()), - _parent_mem_tracker); + _mem_tracker = + MemTracker::create_tracker(-1, "DeltaWriter:" + std::to_string(_tablet->tablet_id())); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // check tablet version number if (_tablet->version_count() > config::max_tablet_version_num) { LOG(WARNING) << "failed to init delta writer. version count: " << _tablet->version_count() @@ -142,7 +141,6 @@ OLAPStatus DeltaWriter::init() { writer_context.txn_id = _req.txn_id; writer_context.load_id = _req.load_id; writer_context.segments_overlap = OVERLAPPING; - writer_context.parent_mem_tracker = _mem_tracker; RETURN_NOT_OK(RowsetFactory::create_rowset_writer(writer_context, &_rowset_writer)); _tablet_schema = &(_tablet->tablet_schema()); @@ -162,6 +160,7 @@ OLAPStatus DeltaWriter::write(Tuple* tuple) { if (!_is_init && !_is_cancelled) { RETURN_NOT_OK(init()); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (_is_cancelled) { // The writer may be cancelled at any time by other thread. @@ -189,6 +188,7 @@ OLAPStatus DeltaWriter::write(const RowBatch* row_batch, const std::vector& if (!_is_init && !_is_cancelled) { RETURN_NOT_OK(init()); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (_is_cancelled) { return OLAP_ERR_ALREADY_CANCELLED; @@ -214,6 +214,7 @@ OLAPStatus DeltaWriter::_flush_memtable_async() { } OLAPStatus DeltaWriter::flush_memtable_and_wait(bool need_wait) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); if (!_is_init) { // This writer is not initialized before flushing. Do nothing @@ -222,7 +223,7 @@ OLAPStatus DeltaWriter::flush_memtable_and_wait(bool need_wait) { // and at that time, the writer may not be initialized yet and that is a normal case. 
return OLAP_SUCCESS; } - + if (_is_cancelled) { return OLAP_ERR_ALREADY_CANCELLED; } @@ -247,6 +248,7 @@ OLAPStatus DeltaWriter::flush_memtable_and_wait(bool need_wait) { } OLAPStatus DeltaWriter::wait_flush() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); if (!_is_init) { // return OLAP_SUCCESS instead of OLAP_ERR_ALREADY_CANCELLED for same reason @@ -262,8 +264,7 @@ OLAPStatus DeltaWriter::wait_flush() { void DeltaWriter::_reset_mem_table() { _mem_table.reset(new MemTable(_tablet->tablet_id(), _schema.get(), _tablet_schema, _req.slots, - _req.tuple_desc, _tablet->keys_type(), _rowset_writer.get(), - _mem_tracker)); + _req.tuple_desc, _tablet->keys_type(), _rowset_writer.get())); } OLAPStatus DeltaWriter::close() { @@ -276,6 +277,7 @@ OLAPStatus DeltaWriter::close() { // for this tablet when being closed. RETURN_NOT_OK(init()); } + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (_is_cancelled) { return OLAP_ERR_ALREADY_CANCELLED; @@ -287,6 +289,7 @@ OLAPStatus DeltaWriter::close() { } OLAPStatus DeltaWriter::close_wait(google::protobuf::RepeatedPtrField* tablet_vec, bool is_broken) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); DCHECK(_is_init) << "delta writer is supposed be to initialized before close_wait() being called"; @@ -297,7 +300,6 @@ OLAPStatus DeltaWriter::close_wait(google::protobuf::RepeatedPtrFieldwait()); - DCHECK_EQ(_mem_tracker->consumption(), 0); // use rowset meta manager to save meta _cur_rowset = _rowset_writer->build(); @@ -351,12 +353,12 @@ OLAPStatus DeltaWriter::close_wait(google::protobuf::RepeatedPtrFieldget_stats(); VLOG_CRITICAL << "close delta writer for tablet: " << _tablet->tablet_id() - << ", load id: " << print_id(_req.load_id) - << ", stats: " << stat; + << ", load id: " << print_id(_req.load_id) << ", stats: " << stat; return OLAP_SUCCESS; } OLAPStatus DeltaWriter::cancel() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::lock_guard l(_lock); if (!_is_init || _is_cancelled) { return OLAP_SUCCESS; @@ -366,7 +368,6 @@ OLAPStatus DeltaWriter::cancel() { // cancel and wait all memtables in flush queue to be finished _flush_token->cancel(); } - DCHECK_EQ(_mem_tracker->consumption(), 0); _is_cancelled = true; return OLAP_SUCCESS; } diff --git a/be/src/olap/delta_writer.h b/be/src/olap/delta_writer.h index b8db71320d6e53..3a7e5612160c46 100644 --- a/be/src/olap/delta_writer.h +++ b/be/src/olap/delta_writer.h @@ -55,8 +55,7 @@ struct WriteRequest { // This class is NOT thread-safe, external synchronization is required. class DeltaWriter { public: - static OLAPStatus open(WriteRequest* req, const std::shared_ptr& parent, - DeltaWriter** writer); + static OLAPStatus open(WriteRequest* req, DeltaWriter** writer); ~DeltaWriter(); @@ -91,8 +90,7 @@ class DeltaWriter { int64_t tablet_id() { return _tablet->tablet_id(); } private: - DeltaWriter(WriteRequest* req, const std::shared_ptr& parent, - StorageEngine* storage_engine); + DeltaWriter(WriteRequest* req, StorageEngine* storage_engine); // push a full memtable to flush executor OLAPStatus _flush_memtable_async(); @@ -117,7 +115,6 @@ class DeltaWriter { StorageEngine* _storage_engine; std::unique_ptr _flush_token; - std::shared_ptr _parent_mem_tracker; std::shared_ptr _mem_tracker; // The counter of number of segment flushed already. 
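DeltaWriter above shows the other half of the pattern: the tracker is created lazily in init(), and every public entry point (write, flush_memtable_and_wait, wait_flush, close, close_wait, cancel) re-installs it, since those methods are called from arbitrary RPC and flush-pool threads. A skeletal version of that discipline, using the ScopedSwitchTracker sketch from earlier; the class and method bodies are hypothetical.

    class SketchDeltaWriter {
    public:
        explicit SketchDeltaWriter(int64_t tablet_id) : _tablet_id(tablet_id) {}

        void init() {
            _mem_tracker = std::make_shared<SimpleTracker>(
                    "DeltaWriter:" + std::to_string(_tablet_id));
            ScopedSwitchTracker guard(_mem_tracker);  // init-time allocations land here
            // ... create rowset writer and first memtable ...
            _inited = true;
        }

        void write() {
            if (!_inited) init();
            // Whichever RPC or flush thread calls in, memtable growth during
            // this call is charged to the writer's tracker, and the thread's
            // previous tracker is restored on return.
            ScopedSwitchTracker guard(_mem_tracker);
            // ... append rows ...
        }

        void cancel() {
            ScopedSwitchTracker guard(_mem_tracker);
            // ... drop memtables, cancel pending flushes ...
        }

    private:
        int64_t _tablet_id;
        std::shared_ptr<SimpleTracker> _mem_tracker;
        bool _inited = false;
    };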
diff --git a/be/src/olap/fs/block_manager.h b/be/src/olap/fs/block_manager.h index 55be413cebe236..fd0b99ea8f8b32 100644 --- a/be/src/olap/fs/block_manager.h +++ b/be/src/olap/fs/block_manager.h @@ -30,7 +30,6 @@ namespace doris { class BlockId; class Env; -class MemTracker; class Slice; namespace fs { @@ -185,10 +184,6 @@ struct CreateBlockOptions { struct BlockManagerOptions { BlockManagerOptions() = default; - // The memory tracker under which all new memory trackers will be parented. - // If nullptr, new memory trackers will be parented to the root tracker. - std::shared_ptr parent_mem_tracker; - // If false, metrics will not be produced. bool enable_metric = false; diff --git a/be/src/olap/fs/file_block_manager.cpp b/be/src/olap/fs/file_block_manager.cpp index 8e54df99d6de07..72b0c43d374583 100644 --- a/be/src/olap/fs/file_block_manager.cpp +++ b/be/src/olap/fs/file_block_manager.cpp @@ -32,7 +32,6 @@ #include "olap/fs/block_id.h" #include "olap/fs/block_manager_metrics.h" #include "olap/storage_engine.h" -#include "runtime/mem_tracker.h" #include "util/doris_metrics.h" #include "util/file_cache.h" #include "util/metrics.h" @@ -367,9 +366,7 @@ Status FileReadableBlock::readv(uint64_t offset, const Slice* results, size_t re FileBlockManager::FileBlockManager(Env* env, BlockManagerOptions opts) : _env(DCHECK_NOTNULL(env)), - _opts(std::move(opts)), - _mem_tracker(MemTracker::CreateTracker(-1, "FileBlockManager", _opts.parent_mem_tracker, - false, false, MemTrackerLevel::OVERVIEW)) { + _opts(std::move(opts)) { if (_opts.enable_metric) { _metrics.reset(new internal::BlockManagerMetrics()); } diff --git a/be/src/olap/fs/file_block_manager.h b/be/src/olap/fs/file_block_manager.h index 118d61988030eb..f8bd96743d2d97 100644 --- a/be/src/olap/fs/file_block_manager.h +++ b/be/src/olap/fs/file_block_manager.h @@ -31,7 +31,6 @@ namespace doris { class BlockId; class Env; -class MemTracker; class RandomAccessFile; namespace fs { @@ -111,10 +110,6 @@ class FileBlockManager : public BlockManager { // May be null if instantiated without metrics. std::unique_ptr _metrics; - // Tracks memory consumption of any allocations numerous enough to be - // interesting. - std::shared_ptr _mem_tracker; - // DISALLOW_COPY_AND_ASSIGN(FileBlockManager); // Underlying cache instance. Caches opened files. 
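The block-manager hunks above can delete their dedicated trackers because accounting no longer depends on each component calling consume() by hand; allocations are intercepted centrally and charged to whichever tracker the current thread has installed. As a toy illustration only, assuming the tls_tracker slot from the first sketch (the real patch hooks the allocator rather than overriding global operator new):

    #include <cstdlib>
    #include <new>

    void* operator new(std::size_t n) {
        void* p = std::malloc(n);
        if (p == nullptr) {
            throw std::bad_alloc();
        }
        if (tls_tracker != nullptr) {
            tls_tracker->consume(static_cast<int64_t>(n));  // charge current thread's tracker
        }
        return p;
    }

    void operator delete(void* p, std::size_t n) noexcept {
        if (tls_tracker != nullptr) {
            tls_tracker->release(static_cast<int64_t>(n));
        }
        std::free(p);
    }

    void operator delete(void* p) noexcept {
        // Unsized form: a real hook recovers the size from the allocator
        // (e.g. malloc_usable_size) instead of skipping the release.
        std::free(p);
    }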
diff --git a/be/src/olap/generic_iterators.cpp b/be/src/olap/generic_iterators.cpp index 1b8f176637ac96..a499f28ac6f14c 100644 --- a/be/src/olap/generic_iterators.cpp +++ b/be/src/olap/generic_iterators.cpp @@ -113,8 +113,7 @@ Status AutoIncrementIterator::next_batch(RowBlockV2* block) { // } class MergeIteratorContext { public: - MergeIteratorContext(RowwiseIterator* iter, std::shared_ptr parent) - : _iter(iter), _block(iter->schema(), 1024, std::move(parent)) {} + MergeIteratorContext(RowwiseIterator* iter) : _iter(iter), _block(iter->schema(), 1024) {} MergeIteratorContext(const MergeIteratorContext&) = delete; MergeIteratorContext(MergeIteratorContext&&) = delete; @@ -207,11 +206,10 @@ Status MergeIteratorContext::_load_next_block() { class MergeIterator : public RowwiseIterator { public: // MergeIterator takes the ownership of input iterators - MergeIterator(std::vector iters, std::shared_ptr parent, int sequence_id_idx) - : _origin_iters(std::move(iters)), _sequence_id_idx(sequence_id_idx), _merge_heap(MergeContextComparator(_sequence_id_idx)) { - // use for count the mem use of Block use in Merge - _mem_tracker = MemTracker::CreateTracker(-1, "MergeIterator", std::move(parent), false); - } + MergeIterator(std::vector iters, int sequence_id_idx) + : _origin_iters(std::move(iters)), + _sequence_id_idx(sequence_id_idx), + _merge_heap(MergeContextComparator(_sequence_id_idx)) {} ~MergeIterator() override { while (!_merge_heap.empty()) { @@ -245,7 +243,7 @@ class MergeIterator : public RowwiseIterator { if (cmp_res != 0) { return cmp_res > 0; } - + // Second: If sequence_id_idx != 0 means we need to compare sequence. sequence only use // in unique key. so keep reverse order of sequence id here if (sequence_id_idx != -1) { @@ -278,7 +276,7 @@ Status MergeIterator::init(const StorageReadOptions& opts) { _schema.reset(new Schema((*(_origin_iters.begin()))->schema())); for (auto iter : _origin_iters) { - std::unique_ptr ctx(new MergeIteratorContext(iter, _mem_tracker)); + std::unique_ptr ctx(new MergeIteratorContext(iter)); RETURN_IF_ERROR(ctx->init(opts)); if (!ctx->valid()) { continue; @@ -323,10 +321,7 @@ class UnionIterator : public RowwiseIterator { // Iterators' ownership it transfered to this class. // This class will delete all iterators when destructs // Client should not use iterators any more. 
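For context on what survives the tracker removal in generic_iterators.cpp: MergeIterator keeps one MergeIteratorContext per input and pops the smallest head through a heap, with sequence ids breaking ties as the comparator above shows. A self-contained toy of that k-way merge over sorted integer runs (no tie-breaking, plain ints instead of row blocks):

    #include <cstddef>
    #include <queue>
    #include <utility>
    #include <vector>

    std::vector<int> merge_sorted_runs(const std::vector<std::vector<int>>& runs) {
        using Head = std::pair<int, std::size_t>;  // (value, run index)
        auto gt = [](const Head& a, const Head& b) { return a.first > b.first; };
        std::priority_queue<Head, std::vector<Head>, decltype(gt)> heap(gt);

        std::vector<std::size_t> pos(runs.size(), 0);
        for (std::size_t i = 0; i < runs.size(); ++i) {
            if (!runs[i].empty()) heap.push({runs[i][0], i});  // seed with each run's head
        }

        std::vector<int> out;
        while (!heap.empty()) {
            auto [value, i] = heap.top();
            heap.pop();
            out.push_back(value);
            if (++pos[i] < runs[i].size()) {
                heap.push({runs[i][pos[i]], i});  // advance the run we took from
            }
        }
        return out;
    }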
- UnionIterator(std::vector<RowwiseIterator*>& v, std::shared_ptr<MemTracker> parent) - : _origin_iters(v.begin(), v.end()) { - _mem_tracker = MemTracker::CreateTracker(-1, "UnionIterator", parent, false); - } + UnionIterator(std::vector<RowwiseIterator*>& v) : _origin_iters(v.begin(), v.end()) {} ~UnionIterator() override { std::for_each(_origin_iters.begin(), _origin_iters.end(), std::default_delete<RowwiseIterator>()); @@ -374,18 +369,18 @@ Status UnionIterator::next_batch(RowBlockV2* block) { return Status::EndOfFile("End of UnionIterator"); } -RowwiseIterator* new_merge_iterator(std::vector<RowwiseIterator*> inputs, std::shared_ptr<MemTracker> parent, int sequence_id_idx) { +RowwiseIterator* new_merge_iterator(std::vector<RowwiseIterator*> inputs, int sequence_id_idx) { if (inputs.size() == 1) { return *(inputs.begin()); } - return new MergeIterator(std::move(inputs), parent, sequence_id_idx); + return new MergeIterator(std::move(inputs), sequence_id_idx); } -RowwiseIterator* new_union_iterator(std::vector<RowwiseIterator*>& inputs, std::shared_ptr<MemTracker> parent) { +RowwiseIterator* new_union_iterator(std::vector<RowwiseIterator*>& inputs) { if (inputs.size() == 1) { return *(inputs.begin()); } - return new UnionIterator(inputs, parent); + return new UnionIterator(inputs); } RowwiseIterator* new_auto_increment_iterator(const Schema& schema, size_t num_rows) { diff --git a/be/src/olap/generic_iterators.h b/be/src/olap/generic_iterators.h index e8f4528885ae29..5ff287b8d7cd8c 100644 --- a/be/src/olap/generic_iterators.h +++ b/be/src/olap/generic_iterators.h @@ -25,14 +25,14 @@ namespace doris { // // Input iterators' ownership is taken by the created merge iterator, and the client // should delete the returned iterator after use. -RowwiseIterator* new_merge_iterator(std::vector<RowwiseIterator*> inputs, std::shared_ptr<MemTracker> parent, int sequence_id_idx); +RowwiseIterator* new_merge_iterator(std::vector<RowwiseIterator*> inputs, int sequence_id_idx); // Create a union iterator for input iterators. Union iterator will read // input iterators one by one. // // Input iterators' ownership is taken by the created union iterator, and the client // should delete the returned iterator after use. -RowwiseIterator* new_union_iterator(std::vector<RowwiseIterator*>& inputs, std::shared_ptr<MemTracker> parent); +RowwiseIterator* new_union_iterator(std::vector<RowwiseIterator*>& inputs); // Create an auto increment iterator which returns num_rows rows in the format of schema. // This class is mainly used in unit tests.
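For callers, the migration is purely mechanical: drop the tracker argument and keep the ownership contract described above. A hedged sketch of the resulting call-site shape, modeled on BetaRowsetReader::init further down in this diff; the function and variable names are illustrative assumptions, not real Doris code:

// Assumes generic_iterators.h and iterators.h are included and `inputs`
// holds one iterator per segment.
Status make_final_iterator(std::vector<RowwiseIterator*> inputs, bool ordered_result,
                           bool segments_overlapping, int sequence_id_idx,
                           const StorageReadOptions& opts, RowwiseIterator** out) {
    if (ordered_result && segments_overlapping) {
        // Heap merge across overlapping segments; the merge iterator takes
        // ownership of `inputs`, and sequence_id_idx == -1 disables the
        // sequence-column tie-break.
        *out = new_merge_iterator(std::move(inputs), sequence_id_idx);
    } else {
        *out = new_union_iterator(inputs);
    }
    return (*out)->init(opts); // caller deletes *out after use, per the header comment
}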
diff --git a/be/src/olap/iterators.h b/be/src/olap/iterators.h index 4cdfc605788354..4609bd0a52ca8f 100644 --- a/be/src/olap/iterators.h +++ b/be/src/olap/iterators.h @@ -114,9 +114,6 @@ class RowwiseIterator { // Return the data id such as segment id, used to keep the insert order when doing // merge sort in a priority queue virtual uint64_t data_id() const { return 0; } - -protected: - std::shared_ptr<MemTracker> _mem_tracker; }; } // namespace doris diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index ca73ab1d8ba1fc..78e856a18ce435 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -16,6 +16,7 @@ #include "olap/olap_index.h" #include "olap/row_block.h" #include "olap/utils.h" +#include "runtime/thread_context.h" #include "util/doris_metrics.h" using std::string; @@ -292,7 +293,8 @@ void LRUCache::_evict_one_entry(LRUHandle* e) { Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const CacheKey& key, void* value), - CachePriority priority) { + CachePriority priority, + std::shared_ptr<MemTracker> source_mem_tracker) { size_t handle_size = sizeof(LRUHandle) - 1 + key.size(); LRUHandle* e = reinterpret_cast<LRUHandle*>(malloc(handle_size)); e->value = value; @@ -318,6 +320,8 @@ Cache::Handle* LRUCache::insert(const CacheKey& key, uint32_t hash, void* value, // note that the cache might get larger than its capacity if not enough // space was freed auto old = _table.insert(e); + DCHECK(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->parent_task_mem_tracker() != nullptr); + source_mem_tracker->transfer_to(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker(), charge); _usage += e->total_size; if (old != nullptr) { old->in_cache = false; @@ -438,12 +442,11 @@ uint32_t ShardedLRUCache::_shard(uint32_t hash) { return hash >> (32 - kNumShardBits); } -ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, - std::shared_ptr<MemTracker> parent) +ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type) : _name(name), _last_id(1), - _mem_tracker(MemTracker::CreateTracker(-1, name, parent, true, false, - MemTrackerLevel::OVERVIEW)) { + _mem_tracker(MemTracker::create_tracker(-1, name, nullptr, MemTrackerLevel::OVERVIEW)) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); const size_t per_shard = (total_capacity + (kNumShards - 1)) / kNumShards; for (int s = 0; s < kNumShards; s++) { _shards[s] = new LRUCache(type); @@ -462,32 +465,38 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, } ShardedLRUCache::~ShardedLRUCache() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); for (int s = 0; s < kNumShards; s++) { delete _shards[s]; } _entity->deregister_hook(_name); DorisMetrics::instance()->metric_registry()->deregister_entity(_entity); - _mem_tracker->Release(_mem_tracker->consumption()); } Cache::Handle* ShardedLRUCache::insert(const CacheKey& key, void* value, size_t charge, void (*deleter)(const CacheKey& key, void* value), CachePriority priority) { + std::shared_ptr<MemTracker> source_mem_tracker = thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker(); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); const uint32_t hash = _hash_slice(key); - return _shards[_shard(hash)]->insert(key, hash, value, charge, deleter, priority); + return _shards[_shard(hash)]->insert(key, hash, value, charge, deleter, priority, + source_mem_tracker); } Cache::Handle* ShardedLRUCache::lookup(const 
CacheKey& key) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); const uint32_t hash = _hash_slice(key); return _shards[_shard(hash)]->lookup(key, hash); } void ShardedLRUCache::release(Handle* handle) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); LRUHandle* h = reinterpret_cast<LRUHandle*>(handle); _shards[_shard(h->hash)]->release(handle); } void ShardedLRUCache::erase(const CacheKey& key) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); const uint32_t hash = _hash_slice(key); _shards[_shard(hash)]->erase(key, hash); } @@ -506,6 +515,7 @@ uint64_t ShardedLRUCache::new_id() { } int64_t ShardedLRUCache::prune() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t num_prune = 0; for (int s = 0; s < kNumShards; s++) { num_prune += _shards[s]->prune(); @@ -514,6 +524,7 @@ } int64_t ShardedLRUCache::prune_if(CacheValuePredicate pred) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t num_prune = 0; for (int s = 0; s < kNumShards; s++) { num_prune += _shards[s]->prune_if(pred); @@ -540,18 +551,14 @@ void ShardedLRUCache::update_cache_metrics() const { usage_ratio->set_value(total_capacity == 0 ? 0 : ((double)total_usage / total_capacity)); hit_ratio->set_value(total_lookup_count == 0 ? 0 : ((double)total_hit_count / total_lookup_count)); - - _mem_tracker->Consume(total_usage - _mem_tracker->consumption()); } -Cache* new_lru_cache(const std::string& name, size_t capacity, - std::shared_ptr<MemTracker> parent_tracker) { - return new ShardedLRUCache(name, capacity, LRUCacheType::SIZE, parent_tracker); +Cache* new_lru_cache(const std::string& name, size_t capacity) { return new ShardedLRUCache(name, capacity, LRUCacheType::SIZE); } -Cache* new_typed_lru_cache(const std::string& name, size_t capacity, LRUCacheType type, - std::shared_ptr<MemTracker> parent_tracker) { - return new ShardedLRUCache(name, capacity, type, parent_tracker); +Cache* new_typed_lru_cache(const std::string& name, size_t capacity, LRUCacheType type) { return new ShardedLRUCache(name, capacity, type); } } // namespace doris diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index 2ea6bda38e2661..f0453016ce96c2 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -56,11 +56,9 @@ enum LRUCacheType { // Create a new cache with a specified name and a fixed SIZE capacity. // This implementation of Cache uses a least-recently-used eviction policy. -extern Cache* new_lru_cache(const std::string& name, size_t capacity, - std::shared_ptr<MemTracker> parent_tracekr = nullptr); +extern Cache* new_lru_cache(const std::string& name, size_t capacity); -extern Cache* new_typed_lru_cache(const std::string& name, size_t capacity, LRUCacheType type, - std::shared_ptr<MemTracker> parent_tracekr = nullptr); +extern Cache* new_typed_lru_cache(const std::string& name, size_t capacity, LRUCacheType type); class CacheKey { public: @@ -315,7 +313,7 @@ class LRUCache { // Like Cache methods, but with an extra "hash" parameter.
Cache::Handle* insert(const CacheKey& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const CacheKey& key, void* value), - CachePriority priority = CachePriority::NORMAL); + CachePriority priority = CachePriority::NORMAL, std::shared_ptr<MemTracker> source_mem_tracker = nullptr); Cache::Handle* lookup(const CacheKey& key, uint32_t hash); void release(Cache::Handle* handle); void erase(const CacheKey& key, uint32_t hash); @@ -362,8 +360,7 @@ static const int kNumShards = 1 << kNumShardBits; class ShardedLRUCache : public Cache { public: - explicit ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type, - std::shared_ptr<MemTracker> parent); + explicit ShardedLRUCache(const std::string& name, size_t total_capacity, LRUCacheType type); // TODO(fdy): clear all cache entries when destructing virtual ~ShardedLRUCache(); virtual Handle* insert(const CacheKey& key, void* value, size_t charge, diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 2ec1ccbbc1edb5..2b347700aaaa0c 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -31,17 +31,16 @@ namespace doris { MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet_schema, const std::vector<SlotDescriptor*>* slot_descs, TupleDescriptor* tuple_desc, - KeysType keys_type, RowsetWriter* rowset_writer, - const std::shared_ptr<MemTracker>& parent_tracker) + KeysType keys_type, RowsetWriter* rowset_writer) : _tablet_id(tablet_id), _schema(schema), _tablet_schema(tablet_schema), _tuple_desc(tuple_desc), _slot_descs(slot_descs), _keys_type(keys_type), - _mem_tracker(MemTracker::CreateTracker(-1, "MemTable", parent_tracker)), - _buffer_mem_pool(new MemPool(_mem_tracker.get())), - _table_mem_pool(new MemPool(_mem_tracker.get())), + _mem_tracker(MemTracker::create_tracker(-1, "MemTable")), + _buffer_mem_pool(new MemPool(_mem_tracker)), + _table_mem_pool(new MemPool(_mem_tracker)), _schema_size(_schema->schema_size()), _rowset_writer(rowset_writer) { if (tablet_schema->sort_type() == SortType::ZORDER) { diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index fdb574cb1e80f3..eced6ffd06920f 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -40,12 +40,12 @@ class MemTable { public: MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet_schema, const std::vector<SlotDescriptor*>* slot_descs, TupleDescriptor* tuple_desc, - KeysType keys_type, RowsetWriter* rowset_writer, - const std::shared_ptr<MemTracker>& parent_tracker); + KeysType keys_type, RowsetWriter* rowset_writer); ~MemTable(); int64_t tablet_id() const { return _tablet_id; } size_t memory_usage() const { return _mem_tracker->consumption(); } + std::shared_ptr<MemTracker> mem_tracker() { return _mem_tracker; } void insert(const Tuple* tuple); /// Flush OLAPStatus flush(); diff --git a/be/src/olap/memtable_flush_executor.cpp b/be/src/olap/memtable_flush_executor.cpp index b63074d2822708..dce1674052236f 100644 --- a/be/src/olap/memtable_flush_executor.cpp +++ b/be/src/olap/memtable_flush_executor.cpp @@ -20,6 +20,7 @@ #include #include "olap/memtable.h" +#include "runtime/thread_context.h" #include "util/scoped_cleanup.h" #include "util/time.h" @@ -28,8 +29,7 @@ namespace doris { std::ostream& operator<<(std::ostream& os, const FlushStatistic& stat) { os << "(flush time(ms)=" << stat.flush_time_ns / NANOS_PER_MILLIS << ", flush wait time(ms)=" << stat.flush_wait_time_ns / NANOS_PER_MILLIS - << ", flush count=" << stat.flush_count - << ", flush bytes: " << stat.flush_size_bytes + << ", flush count=" << stat.flush_count << ", flush bytes: " << stat.flush_size_bytes << ", flush 
disk bytes: " << stat.flush_disk_size_bytes << ")"; return os; } @@ -42,7 +42,8 @@ std::ostream& operator<<(std::ostream& os, const FlushStatistic& stat) { OLAPStatus FlushToken::submit(const std::shared_ptr<MemTable>& memtable) { RETURN_NOT_OK(_flush_status.load()); int64_t submit_task_time = MonotonicNanos(); - _flush_token->submit_func(std::bind(&FlushToken::_flush_memtable, this, memtable, submit_task_time)); + _flush_token->submit_func( + std::bind(&FlushToken::_flush_memtable, this, memtable, submit_task_time)); return OLAP_SUCCESS; } @@ -56,6 +57,7 @@ OLAPStatus FlushToken::wait() { } void FlushToken::_flush_memtable(std::shared_ptr<MemTable> memtable, int64_t submit_task_time) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(memtable->mem_tracker()); _stats.flush_wait_time_ns += (MonotonicNanos() - submit_task_time); SCOPED_CLEANUP({ memtable.reset(); }); // If previous flush has failed, return directly @@ -71,9 +73,8 @@ void FlushToken::_flush_memtable(std::shared_ptr<MemTable> memtable, int64_t sub } VLOG_CRITICAL << "flush memtable cost: " << timer.elapsed_time() - << ", count: " << _stats.flush_count - << ", mem size: " << memtable->memory_usage() - << ", disk size: " << memtable->flush_size(); + << ", count: " << _stats.flush_count << ", mem size: " << memtable->memory_usage() + << ", disk size: " << memtable->flush_size(); _stats.flush_time_ns += timer.elapsed_time(); _stats.flush_count++; _stats.flush_size_bytes += memtable->memory_usage(); diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index ec332f0f6caf12..66dfaaf5974a79 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -48,9 +48,7 @@ OLAPStatus Merger::merge_rowsets(TabletSharedPtr tablet, ReaderType reader_type, "failed to init row cursor when merging rowsets of tablet " + tablet->full_name()); row_cursor.allocate_memory_for_string_type(tablet->tablet_schema()); - // TODO(yingchun): monitor - std::shared_ptr<MemTracker> tracker(new MemTracker(-1)); - std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get())); + std::unique_ptr<MemPool> mem_pool(new MemPool("Merger:merge_rowsets")); // The following procedure may last a long time, e.g. half a day. int64_t output_rows = 0; diff --git a/be/src/olap/olap_index.cpp b/be/src/olap/olap_index.cpp index 14ae7e4434cb0d..85f0625889c74c 100644 --- a/be/src/olap/olap_index.cpp +++ b/be/src/olap/olap_index.cpp @@ -40,8 +40,7 @@ MemIndex::MemIndex() _index_size(0), _data_size(0), _num_rows(0), - _tracker(new MemTracker(-1)), - _mem_pool(new MemPool(_tracker.get())) {} + _mem_pool(new MemPool("MemIndex")) {} MemIndex::~MemIndex() { _num_entries = 0; diff --git a/be/src/olap/olap_index.h b/be/src/olap/olap_index.h index 1b9c704c41007f..11e22d2b67f89c 100644 --- a/be/src/olap/olap_index.h +++ b/be/src/olap/olap_index.h @@ -291,7 +291,6 @@ class MemIndex { size_t _num_rows; std::vector* _short_key_columns; - std::shared_ptr<MemTracker> _tracker; std::unique_ptr<MemPool> _mem_pool; DISALLOW_COPY_AND_ASSIGN(MemIndex); }; diff --git a/be/src/olap/page_cache.cpp b/be/src/olap/page_cache.cpp index 76dd0542a85f4e..65882ccbee3611 100644 --- a/be/src/olap/page_cache.cpp +++ b/be/src/olap/page_cache.cpp @@ -16,6 +16,7 @@ // under the License.
#include "olap/page_cache.h" +#include "runtime/thread_context.h" namespace doris { @@ -29,20 +30,27 @@ void StoragePageCache::create_global_cache(size_t capacity, int32_t index_cache_ StoragePageCache::StoragePageCache(size_t capacity, int32_t index_cache_percentage) : _index_cache_percentage(index_cache_percentage), - _mem_tracker(MemTracker::CreateTracker(capacity, "StoragePageCache", nullptr, true, true, MemTrackerLevel::OVERVIEW)) { + _mem_tracker(MemTracker::create_tracker(capacity, "StoragePageCache", nullptr, + MemTrackerLevel::OVERVIEW)) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (index_cache_percentage == 0) { - _data_page_cache = std::unique_ptr(new_lru_cache("DataPageCache", capacity, _mem_tracker)); + _data_page_cache = + std::unique_ptr(new_lru_cache("DataPageCache", capacity)); } else if (index_cache_percentage == 100) { - _index_page_cache = std::unique_ptr(new_lru_cache("IndexPageCache", capacity, _mem_tracker)); + _index_page_cache = + std::unique_ptr(new_lru_cache("IndexPageCache", capacity)); } else if (index_cache_percentage > 0 && index_cache_percentage < 100) { - _data_page_cache = std::unique_ptr(new_lru_cache("DataPageCache", capacity * (100 - index_cache_percentage) / 100, _mem_tracker)); - _index_page_cache = std::unique_ptr(new_lru_cache("IndexPageCache", capacity * index_cache_percentage / 100, _mem_tracker)); + _data_page_cache = std::unique_ptr(new_lru_cache( + "DataPageCache", capacity * (100 - index_cache_percentage) / 100)); + _index_page_cache = std::unique_ptr(new_lru_cache( + "IndexPageCache", capacity * index_cache_percentage / 100)); } else { CHECK(false) << "invalid index page cache percentage"; } } -bool StoragePageCache::lookup(const CacheKey& key, PageCacheHandle* handle, segment_v2::PageTypePB page_type) { +bool StoragePageCache::lookup(const CacheKey& key, PageCacheHandle* handle, + segment_v2::PageTypePB page_type) { auto cache = _get_page_cache(page_type); auto lru_handle = cache->lookup(key.encode()); if (lru_handle == nullptr) { diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index ef50f5a3bac316..833d8822594ac6 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -903,9 +903,7 @@ OLAPStatus PushBrokerReader::init(const Schema* schema, const TBrokerScanRange& } _runtime_profile = _runtime_state->runtime_profile(); _runtime_profile->set_name("PushBrokerReader"); - _mem_tracker = MemTracker::CreateTracker(-1, "PushBrokerReader", - _runtime_state->instance_mem_tracker()); - _mem_pool.reset(new MemPool(_mem_tracker.get())); + _mem_pool.reset(new MemPool("PushBrokerReader")); _counter.reset(new ScannerCounter()); // init scanner diff --git a/be/src/olap/push_handler.h b/be/src/olap/push_handler.h index c1499b2858e36d..4c3d54a67795ea 100644 --- a/be/src/olap/push_handler.h +++ b/be/src/olap/push_handler.h @@ -211,7 +211,6 @@ class PushBrokerReader { const Schema* _schema; std::unique_ptr _runtime_state; RuntimeProfile* _runtime_profile; - std::shared_ptr _mem_tracker; std::unique_ptr _mem_pool; std::unique_ptr _counter; std::unique_ptr _scanner; diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp index 4deda90eb0cc8c..e2aa2dc5d4e877 100644 --- a/be/src/olap/reader.cpp +++ b/be/src/olap/reader.cpp @@ -37,7 +37,6 @@ #include "olap/storage_engine.h" #include "olap/tablet.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "runtime/string_value.hpp" #include "util/date_func.h" #include "util/mem_util.hpp" @@ -107,9 +106,7 @@ 
TabletReader::~TabletReader() { } OLAPStatus TabletReader::init(const ReaderParams& read_params) { - // TODO(yingchun): monitor - _tracker.reset(new MemTracker(-1, read_params.tablet->full_name())); - _predicate_mem_pool.reset(new MemPool(_tracker.get())); + _predicate_mem_pool.reset(new MemPool("TabletReader:" + read_params.tablet->full_name())); OLAPStatus res = _init_params(read_params); if (res != OLAP_SUCCESS) { diff --git a/be/src/olap/reader.h b/be/src/olap/reader.h index 3137e0612dd39e..de45f749c4ff13 100644 --- a/be/src/olap/reader.h +++ b/be/src/olap/reader.h @@ -185,7 +185,6 @@ class TabletReader { TabletSharedPtr tablet() { return _tablet; } - std::shared_ptr<MemTracker> _tracker; std::unique_ptr<MemPool> _predicate_mem_pool; std::set<uint32_t> _load_bf_columns; std::set<uint32_t> _load_bf_all_columns; diff --git a/be/src/olap/row_block.cpp b/be/src/olap/row_block.cpp index d6f522093a60cd..061972edd3d372 100644 --- a/be/src/olap/row_block.cpp +++ b/be/src/olap/row_block.cpp @@ -37,10 +37,8 @@ using std::vector; namespace doris { -RowBlock::RowBlock(const TabletSchema* schema, const std::shared_ptr<MemTracker>& parent_tracker) - : _capacity(0), _schema(schema) { - _tracker = MemTracker::CreateTracker(-1, "RowBlock", parent_tracker, true, true, MemTrackerLevel::VERBOSE); - _mem_pool.reset(new MemPool(_tracker.get())); +RowBlock::RowBlock(const TabletSchema* schema) : _capacity(0), _schema(schema) { + _mem_pool.reset(new MemPool("RowBlock")); } RowBlock::~RowBlock() { @@ -90,7 +88,8 @@ void RowBlock::_compute_layout() { // Every field has a null byte in memory if (column.type() == OLAP_FIELD_TYPE_VARCHAR || column.type() == OLAP_FIELD_TYPE_HLL || - column.type() == OLAP_FIELD_TYPE_CHAR || column.type() == OLAP_FIELD_TYPE_OBJECT ||column.type() == OLAP_FIELD_TYPE_STRING) { + column.type() == OLAP_FIELD_TYPE_CHAR || column.type() == OLAP_FIELD_TYPE_OBJECT || + column.type() == OLAP_FIELD_TYPE_STRING) { // For the variable-length part, additionally account for the actual maximum string length (this length already includes the 2 bytes that record the length) memory_size += sizeof(Slice) + sizeof(char); } else { diff --git a/be/src/olap/row_block.h b/be/src/olap/row_block.h index 75924fa63228de..6b1dd0255412cb 100644 --- a/be/src/olap/row_block.h +++ b/be/src/olap/row_block.h @@ -57,8 +57,7 @@ class RowBlock { friend class VectorizedRowBatch; public: - RowBlock(const TabletSchema* schema, - const std::shared_ptr<MemTracker>& parent_tracker = nullptr); + RowBlock(const TabletSchema* schema); // Note: remember to reclaim the internal buffer ~RowBlock(); @@ -136,7 +135,6 @@ class RowBlock { size_t _limit = 0; uint8_t _block_status = DEL_PARTIAL_SATISFIED; - std::shared_ptr<MemTracker> _tracker; std::unique_ptr<MemPool> _mem_pool; // Copy and assignment are disallowed because this class owns memory resources DISALLOW_COPY_AND_ASSIGN(RowBlock); }; diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp index dda02b335d71eb..ac067c7a51db7f 100644 --- a/be/src/olap/row_block2.cpp +++ b/be/src/olap/row_block2.cpp @@ -33,14 +33,10 @@ using strings::Substitute; namespace doris { RowBlockV2::RowBlockV2(const Schema& schema, uint16_t capacity) - : RowBlockV2(schema, capacity, nullptr) {} - -RowBlockV2::RowBlockV2(const Schema& schema, uint16_t capacity, std::shared_ptr<MemTracker> parent) : _schema(schema), _capacity(capacity), _column_vector_batches(_schema.num_columns()), - _tracker(MemTracker::CreateTracker(-1, "RowBlockV2", std::move(parent))), - _pool(new MemPool(_tracker.get())), + _pool(new MemPool("RowBlockV2")), _selection_vector(nullptr) { for (auto cid : _schema.column_ids()) { Status status = ColumnVectorBatch::create( diff --git a/be/src/olap/row_block2.h b/be/src/olap/row_block2.h index 7f2b79d638e90c..35c4a48996addd 100644 --- a/be/src/olap/row_block2.h +++ 
b/be/src/olap/row_block2.h @@ -28,7 +28,6 @@ #include "olap/selection_vector.h" #include "olap/types.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" namespace doris { @@ -43,7 +42,6 @@ class RowBlockV2 { public: RowBlockV2(const Schema& schema, uint16_t capacity); - RowBlockV2(const Schema& schema, uint16_t capacity, std::shared_ptr parent); ~RowBlockV2(); // update number of rows contained in this block @@ -119,7 +117,6 @@ class RowBlockV2 { size_t _num_rows; // manages the memory for slice's data - std::shared_ptr _tracker; std::unique_ptr _pool; // index of selected rows for rows passed the predicate diff --git a/be/src/olap/rowset/alpha_rowset.cpp b/be/src/olap/rowset/alpha_rowset.cpp index 53c695f4d65056..9e3ff54092fab2 100644 --- a/be/src/olap/rowset/alpha_rowset.cpp +++ b/be/src/olap/rowset/alpha_rowset.cpp @@ -55,14 +55,6 @@ OLAPStatus AlphaRowset::create_reader(std::shared_ptr* result) { return OLAP_SUCCESS; } -OLAPStatus AlphaRowset::create_reader(const std::shared_ptr& parent_tracker, - std::shared_ptr* result) { - result->reset(new AlphaRowsetReader(_schema->num_rows_per_row_block(), - std::static_pointer_cast(shared_from_this()), - parent_tracker)); - return OLAP_SUCCESS; -} - OLAPStatus AlphaRowset::remove() { VLOG_NOTICE << "begin to remove files in rowset " << unique_id() << ", version:" << start_version() << "-" << end_version() << ", tabletid:" << _rowset_meta->tablet_id(); diff --git a/be/src/olap/rowset/alpha_rowset.h b/be/src/olap/rowset/alpha_rowset.h index 364d59dc78fef6..8a5b4d9dc549f9 100644 --- a/be/src/olap/rowset/alpha_rowset.h +++ b/be/src/olap/rowset/alpha_rowset.h @@ -41,9 +41,6 @@ class AlphaRowset : public Rowset { OLAPStatus create_reader(std::shared_ptr* result) override; - OLAPStatus create_reader(const std::shared_ptr& parent_tracker, - std::shared_ptr* result) override; - OLAPStatus split_range(const RowCursor& start_key, const RowCursor& end_key, uint64_t request_block_row_count, size_t key_num, std::vector* ranges) override; diff --git a/be/src/olap/rowset/alpha_rowset_reader.cpp b/be/src/olap/rowset/alpha_rowset_reader.cpp index b22bfa1fb59eb0..e883680f50bb15 100644 --- a/be/src/olap/rowset/alpha_rowset_reader.cpp +++ b/be/src/olap/rowset/alpha_rowset_reader.cpp @@ -22,11 +22,9 @@ namespace doris { -AlphaRowsetReader::AlphaRowsetReader(int num_rows_per_row_block, AlphaRowsetSharedPtr rowset, - const std::shared_ptr& parent_tracker) +AlphaRowsetReader::AlphaRowsetReader(int num_rows_per_row_block, AlphaRowsetSharedPtr rowset) : _num_rows_per_row_block(num_rows_per_row_block), _rowset(std::move(rowset)), - _parent_tracker(parent_tracker), _alpha_rowset_meta( std::static_pointer_cast(_rowset->rowset_meta()).get()), _segment_groups(_rowset->_segment_groups), @@ -68,8 +66,7 @@ OLAPStatus AlphaRowsetReader::init(RowsetReaderContext* read_context) { if (_current_read_context->need_ordered_result && _is_segments_overlapping && _sequential_ctxs.size() > 1) { _next_block = &AlphaRowsetReader::_merge_block; - _read_block.reset(new (std::nothrow) - RowBlock(_current_read_context->tablet_schema, _parent_tracker)); + _read_block.reset(new (std::nothrow) RowBlock(_current_read_context->tablet_schema)); if (_read_block == nullptr) { LOG(WARNING) << "new row block failed in reader"; return OLAP_ERR_MALLOC_ERROR; @@ -322,8 +319,7 @@ OLAPStatus AlphaRowsetReader::_init_merge_ctxs(RowsetReaderContext* read_context const bool use_index_stream_cache = read_context->reader_type == READER_QUERY; for (auto& segment_group : _segment_groups) { - 
std::unique_ptr new_column_data( - ColumnData::create(segment_group.get(), _parent_tracker)); + std::unique_ptr new_column_data(ColumnData::create(segment_group.get())); OLAPStatus status = new_column_data->init(); if (status != OLAP_SUCCESS) { LOG(WARNING) << "init column data failed"; diff --git a/be/src/olap/rowset/alpha_rowset_reader.h b/be/src/olap/rowset/alpha_rowset_reader.h index e76bb9465d44f5..018d78153c2f4a 100644 --- a/be/src/olap/rowset/alpha_rowset_reader.h +++ b/be/src/olap/rowset/alpha_rowset_reader.h @@ -52,8 +52,7 @@ struct AlphaMergeContextComparator { class AlphaRowsetReader : public RowsetReader { public: - AlphaRowsetReader(int num_rows_per_row_block, AlphaRowsetSharedPtr rowset, - const std::shared_ptr& parent_tracker = nullptr); + AlphaRowsetReader(int num_rows_per_row_block, AlphaRowsetSharedPtr rowset); ~AlphaRowsetReader() override; @@ -61,7 +60,6 @@ class AlphaRowsetReader : public RowsetReader { OLAPStatus init(RowsetReaderContext* read_context) override; // read next block data - // If parent_tracker is not null, the block we get from next_block() will have the parent_tracker. // It's ok, because we only get ref here, the block's owner is this reader. OLAPStatus next_block(RowBlock** block) override; @@ -104,7 +102,6 @@ class AlphaRowsetReader : public RowsetReader { private: int _num_rows_per_row_block; AlphaRowsetSharedPtr _rowset; - std::shared_ptr _parent_tracker; std::string _rowset_path; AlphaRowsetMeta* _alpha_rowset_meta; const std::vector>& _segment_groups; diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index 99ce9e5dcccfe5..fabcacfbab7716 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -74,14 +74,6 @@ OLAPStatus BetaRowset::create_reader(RowsetReaderSharedPtr* result) { return OLAP_SUCCESS; } -OLAPStatus BetaRowset::create_reader(const std::shared_ptr& parent_tracker, - std::shared_ptr* result) { - // NOTE: We use std::static_pointer_cast for performance - result->reset(new BetaRowsetReader(std::static_pointer_cast(shared_from_this()), - parent_tracker)); - return OLAP_SUCCESS; -} - OLAPStatus BetaRowset::split_range(const RowCursor& start_key, const RowCursor& end_key, uint64_t request_block_row_count, size_t key_num, std::vector* ranges) { diff --git a/be/src/olap/rowset/beta_rowset.h b/be/src/olap/rowset/beta_rowset.h index 5030b298e1770e..81ad134904766f 100644 --- a/be/src/olap/rowset/beta_rowset.h +++ b/be/src/olap/rowset/beta_rowset.h @@ -39,9 +39,6 @@ class BetaRowset : public Rowset { OLAPStatus create_reader(RowsetReaderSharedPtr* result) override; - OLAPStatus create_reader(const std::shared_ptr& parent_tracker, - std::shared_ptr* result) override; - static FilePathDesc segment_file_path(const FilePathDesc& segment_dir_desc, const RowsetId& rowset_id, int segment_id); diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index 3aed8eb3c37124..dd7e1586a58b84 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -31,21 +31,14 @@ namespace doris { -BetaRowsetReader::BetaRowsetReader(BetaRowsetSharedPtr rowset, - std::shared_ptr parent_tracker) +BetaRowsetReader::BetaRowsetReader(BetaRowsetSharedPtr rowset) : _context(nullptr), _rowset(std::move(rowset)), - _stats(&_owned_stats), - _parent_tracker(std::move(parent_tracker)) { + _stats(&_owned_stats) { _rowset->acquire(); } OLAPStatus BetaRowsetReader::init(RowsetReaderContext* read_context) { - // If do not init the 
RowsetReader with a parent_tracker, use the runtime_state instance_mem_tracker - if (_parent_tracker == nullptr && read_context->runtime_state != nullptr) { - _parent_tracker = read_context->runtime_state->instance_mem_tracker(); - } - RETURN_NOT_OK(_rowset->load()); _context = read_context; if (_context->stats != nullptr) { @@ -102,7 +95,7 @@ OLAPStatus BetaRowsetReader::init(RowsetReaderContext* read_context) { std::vector> seg_iterators; for (auto& seg_ptr : _segment_cache_handle.get_segments()) { std::unique_ptr iter; - auto s = seg_ptr->new_iterator(*_schema, read_options, _parent_tracker, &iter); + auto s = seg_ptr->new_iterator(*_schema, read_options, &iter); if (!s.ok()) { LOG(WARNING) << "failed to create iterator[" << seg_ptr->id() << "]: " << s.to_string(); return OLAP_ERR_ROWSET_READER_INIT; @@ -119,9 +112,9 @@ OLAPStatus BetaRowsetReader::init(RowsetReaderContext* read_context) { // merge or union segment iterator RowwiseIterator* final_iterator; if (read_context->need_ordered_result && _rowset->rowset_meta()->is_segments_overlapping()) { - final_iterator = new_merge_iterator(iterators, _parent_tracker, read_context->sequence_id_idx); + final_iterator = new_merge_iterator(iterators, read_context->sequence_id_idx); } else { - final_iterator = new_union_iterator(iterators, _parent_tracker); + final_iterator = new_union_iterator(iterators); } auto s = final_iterator->init(read_options); if (!s.ok()) { @@ -132,11 +125,11 @@ OLAPStatus BetaRowsetReader::init(RowsetReaderContext* read_context) { // init input block _input_block.reset(new RowBlockV2(*_schema, - std::min(1024, read_context->batch_size), _parent_tracker)); + std::min(1024, read_context->batch_size))); if (!read_context->is_vec) { // init input/output block and row - _output_block.reset(new RowBlock(read_context->tablet_schema, _parent_tracker)); + _output_block.reset(new RowBlock(read_context->tablet_schema)); RowBlockInfo output_block_info; output_block_info.row_num = std::min(1024, read_context->batch_size); diff --git a/be/src/olap/rowset/beta_rowset_reader.h b/be/src/olap/rowset/beta_rowset_reader.h index 55a8938dbfb7f5..eeddef80da8653 100644 --- a/be/src/olap/rowset/beta_rowset_reader.h +++ b/be/src/olap/rowset/beta_rowset_reader.h @@ -30,14 +30,12 @@ namespace doris { class BetaRowsetReader : public RowsetReader { public: - BetaRowsetReader(BetaRowsetSharedPtr rowset, - std::shared_ptr parent_tracker = nullptr); + BetaRowsetReader(BetaRowsetSharedPtr rowset); ~BetaRowsetReader() override { _rowset->release(); } OLAPStatus init(RowsetReaderContext* read_context) override; - // If parent_tracker is not null, the block we get from next_block() will have the parent_tracker. // It's ok, because we only get ref here, the block's owner is this reader. 
OLAPStatus next_block(RowBlock** block) override; OLAPStatus next_block(vectorized::Block* block) override; @@ -63,8 +61,6 @@ class BetaRowsetReader : public RowsetReader { OlapReaderStatistics _owned_stats; OlapReaderStatistics* _stats; - std::shared_ptr _parent_tracker; - std::unique_ptr _iterator; std::unique_ptr _input_block; diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index be8713137a3dfa..8271ea232c1fef 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -223,8 +223,8 @@ OLAPStatus BetaRowsetWriter::_create_segment_writer(std::unique_ptrreset(new segment_v2::SegmentWriter(wblock.get(), _num_segment, - _context.tablet_schema, writer_options, _context.parent_mem_tracker)); + writer->reset(new segment_v2::SegmentWriter(wblock.get(), _num_segment, _context.tablet_schema, + writer_options)); { std::lock_guard l(_lock); _wblocks.push_back(std::move(wblock)); diff --git a/be/src/olap/rowset/column_data.cpp b/be/src/olap/rowset/column_data.cpp index 224367480c6f7c..178710603b726a 100644 --- a/be/src/olap/rowset/column_data.cpp +++ b/be/src/olap/rowset/column_data.cpp @@ -24,16 +24,13 @@ namespace doris { -ColumnData* ColumnData::create(SegmentGroup* segment_group, - const std::shared_ptr& parent_tracker) { - ColumnData* data = new (std::nothrow) ColumnData(segment_group, parent_tracker); +ColumnData* ColumnData::create(SegmentGroup* segment_group) { + ColumnData* data = new (std::nothrow) ColumnData(segment_group); return data; } -ColumnData::ColumnData(SegmentGroup* segment_group, - const std::shared_ptr& parent_tracker) +ColumnData::ColumnData(SegmentGroup* segment_group) : _segment_group(segment_group), - _parent_tracker(parent_tracker), _eof(false), _conditions(nullptr), _col_predicates(nullptr), @@ -138,7 +135,7 @@ OLAPStatus ColumnData::_seek_to_block(const RowBlockPosition& block_pos, bool wi _segment_reader = new (std::nothrow) SegmentReader(file_name, segment_group(), block_pos.segment, _seek_columns, _load_bf_columns, _conditions, _delete_handler, _delete_status, - _lru_cache, _runtime_state, _stats, _parent_tracker); + _lru_cache, _runtime_state, _stats); if (_segment_reader == nullptr) { OLAP_LOG_WARNING("fail to malloc segment reader."); return OLAP_ERR_MALLOC_ERROR; @@ -435,14 +432,12 @@ void ColumnData::set_read_params(const std::vector& return_columns, } _read_vector_batch.reset(new VectorizedRowBatch(&(_segment_group->get_tablet_schema()), - _return_columns, _num_rows_per_block, - _parent_tracker)); + _return_columns, _num_rows_per_block)); _seek_vector_batch.reset(new VectorizedRowBatch(&(_segment_group->get_tablet_schema()), - _seek_columns, _num_rows_per_block, - _parent_tracker)); + _seek_columns, _num_rows_per_block)); - _read_block.reset(new RowBlock(&(_segment_group->get_tablet_schema()), _parent_tracker)); + _read_block.reset(new RowBlock(&(_segment_group->get_tablet_schema()))); RowBlockInfo block_info; block_info.row_num = _num_rows_per_block; block_info.null_supported = true; @@ -580,7 +575,7 @@ OLAPStatus ColumnData::schema_change_init() { _read_vector_batch.reset(new VectorizedRowBatch(&(_segment_group->get_tablet_schema()), _return_columns, _num_rows_per_block)); - _read_block.reset(new RowBlock(&(_segment_group->get_tablet_schema()), _parent_tracker)); + _read_block.reset(new RowBlock(&(_segment_group->get_tablet_schema()))); RowBlockInfo block_info; block_info.row_num = _num_rows_per_block; diff --git a/be/src/olap/rowset/column_data.h 
b/be/src/olap/rowset/column_data.h index e2565b4c4b84a8..c5ad7410195ae7 100644 --- a/be/src/olap/rowset/column_data.h +++ b/be/src/olap/rowset/column_data.h @@ -39,10 +39,8 @@ class SegmentReader; // This class is the column data reader; it is used in two cases. class ColumnData { public: - static ColumnData* create(SegmentGroup* segment_group, - const std::shared_ptr<MemTracker>& parent_tracker = nullptr); - ColumnData(SegmentGroup* segment_group, - const std::shared_ptr<MemTracker>& parent_tracker = nullptr); + static ColumnData* create(SegmentGroup* segment_group); + ColumnData(SegmentGroup* segment_group); ~ColumnData(); // For backward compatibility, some index interfaces are exposed @@ -139,7 +137,6 @@ class ColumnData { private: SegmentGroup* _segment_group; - std::shared_ptr<MemTracker> _parent_tracker; // This flag is set when the end of file or the end key is reached bool _eof; const Conditions* _conditions; diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index 8e952d48a6ab63..a6b533ef15588c 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -32,7 +32,6 @@ namespace doris { class DataDir; -class MemTracker; class OlapTuple; class RowCursor; class Rowset; @@ -119,10 +118,6 @@ class Rowset : public std::enable_shared_from_this<Rowset> { // returns OLAP_ERR_ROWSET_CREATE_READER when failed to create reader virtual OLAPStatus create_reader(std::shared_ptr<RowsetReader>* result) = 0; - // Support adding parent tracker, but should be careful about destruction sequence. - virtual OLAPStatus create_reader(const std::shared_ptr<MemTracker>& parent_tracker, - std::shared_ptr<RowsetReader>* result) = 0; - // Split range denoted by `start_key` and `end_key` into sub-ranges, each contains roughly // `request_block_row_count` rows. Sub-range is represented by pair of OlapTuples and added to `ranges`. // diff --git a/be/src/olap/rowset/rowset_writer_context.h b/be/src/olap/rowset/rowset_writer_context.h index 8c314f5dba3cd6..74b65327ccc157 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -62,7 +62,6 @@ struct RowsetWriterContext { // indicate whether the data among segments is overlapping. // default is OVERLAP_UNKNOWN. SegmentsOverlapPB segments_overlap; - std::shared_ptr<MemTracker> parent_mem_tracker; // segment files use uint32 to represent row numbers, therefore the maximum is UINT32_MAX. // the default is set to INT32_MAX to avoid overflow issues when casting from uint32_t to int. // test cases can change this value to control flush timing
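With the overload above gone, every read path goes through the single-argument create_reader(), and memory is attributed to whatever tracker is installed on the reading thread. A hedged sketch of the caller-side change; `open_reader` and its variable names are illustrative assumptions, not a real Doris function:

// Types come from rowset.h / rowset_reader.h; error handling abbreviated.
OLAPStatus open_reader(const RowsetSharedPtr& rowset, RowsetReaderContext* read_context,
                       RowsetReaderSharedPtr* reader) {
    // Before this patch: rowset->create_reader(parent_tracker, reader);
    // Now there is a single signature with no tracker threading.
    RETURN_NOT_OK(rowset->create_reader(reader));
    return (*reader)->init(read_context);
}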
diff --git a/be/src/olap/rowset/segment_reader.cpp b/be/src/olap/rowset/segment_reader.cpp index 192caaa8789b44..da149b282b9309 100644 --- a/be/src/olap/rowset/segment_reader.cpp +++ b/be/src/olap/rowset/segment_reader.cpp @@ -25,6 +25,7 @@ #include "olap/in_stream.h" #include "olap/olap_cond.h" #include "olap/out_stream.h" +#include "runtime/thread_context.h" #include "olap/row_block.h" #include "olap/rowset/segment_group.h" @@ -37,8 +38,7 @@ SegmentReader::SegmentReader(const std::string file, SegmentGroup* segment_group const std::set<uint32_t>& load_bf_columns, const Conditions* conditions, const DeleteHandler* delete_handler, const DelCondSatisfied delete_status, Cache* lru_cache, - RuntimeState* runtime_state, OlapReaderStatistics* stats, - const std::shared_ptr<MemTracker>& parent_tracker) + RuntimeState* runtime_state, OlapReaderStatistics* stats) : _file_name(file), _segment_group(segment_group), _segment_id(segment_id), @@ -58,8 +58,7 @@ SegmentReader::SegmentReader(const std::string file, SegmentGroup* segment_group _is_using_mmap(false), _is_data_loaded(false), _buffer_size(0), - _tracker(MemTracker::CreateTracker(-1, "SegmentReader:" + file, parent_tracker, false)), - _mem_pool(new MemPool(_tracker.get())), + _mem_pool(new MemPool("SegmentReader:" + file)), _shared_buffer(nullptr), _lru_cache(lru_cache), _runtime_state(runtime_state), @@ -86,10 +85,6 @@ SegmentReader::~SegmentReader() { _lru_cache = nullptr; _file_handler.close(); - if (_is_data_loaded && _runtime_state != nullptr) { - MemTracker::update_limits(_buffer_size * -1, _runtime_state->mem_trackers()); - } - for (auto& it : _streams) { delete it.second; } @@ -237,6 +232,7 @@ OLAPStatus SegmentReader::seek_to_block(uint32_t first_block, uint32_t last_bloc if (!_is_data_loaded) { _reset_readers(); + if (!CHECK_MEM_LIMIT(_buffer_size)) return OLAP_ERR_FETCH_MEMORY_EXCEEDED; res = _read_all_data_streams(&_buffer_size); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to read data stream"); @@ -249,13 +245,6 @@ OLAPStatus SegmentReader::seek_to_block(uint32_t first_block, uint32_t last_bloc return res; } - if (_runtime_state != nullptr) { - MemTracker::update_limits(_buffer_size, _runtime_state->mem_trackers()); - if (MemTracker::limit_exceeded(_runtime_state->mem_trackers())) { - return OLAP_ERR_FETCH_MEMORY_EXCEEDED; - } - } - _is_data_loaded = true; } @@ -836,10 +825,6 @@ OLAPStatus SegmentReader::_reset_readers() { for (std::map::iterator it = _streams.begin(); it != _streams.end(); ++it) { - if (_runtime_state != nullptr) { - MemTracker::update_limits(-1 * it->second->get_buffer_size(), - _runtime_state->mem_trackers()); - } delete it->second; } @@ -850,10 +835,6 @@ OLAPStatus SegmentReader::_reset_readers() { if ((*it) == nullptr) { continue; } - if (_runtime_state != nullptr) { - MemTracker::update_limits(-1 * (*it)->get_buffer_size(), - _runtime_state->mem_trackers()); - } delete (*it); } diff --git a/be/src/olap/rowset/segment_reader.h b/be/src/olap/rowset/segment_reader.h index 0d3aef0b75f949..91464ac2ca5916 100644 --- a/be/src/olap/rowset/segment_reader.h +++ b/be/src/olap/rowset/segment_reader.h @@ -51,8 +51,7 @@ class SegmentReader { const std::vector<uint32_t>& used_columns, const std::set<uint32_t>& load_bf_columns, const Conditions* conditions, const DeleteHandler* delete_handler, const DelCondSatisfied delete_status, - Cache* lru_cache, RuntimeState* runtime_state, OlapReaderStatistics* stats, - const std::shared_ptr<MemTracker>& parent_tracker = nullptr); + Cache* lru_cache, RuntimeState* 
runtime_state, OlapReaderStatistics* stats); ~SegmentReader(); @@ -317,7 +316,6 @@ class SegmentReader { std::vector _cache_handle; const FileHeader* _file_header; - std::shared_ptr _tracker; std::unique_ptr _mem_pool; StorageByteBuffer* _shared_buffer; diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index f986fce59879a1..95ad47e4f1a9b8 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -37,8 +37,7 @@ BinaryDictPageBuilder::BinaryDictPageBuilder(const PageBuilderOptions& options) _data_page_builder(nullptr), _dict_builder(nullptr), _encoding_type(DICT_ENCODING), - _tracker(new MemTracker()), - _pool(_tracker.get()) { + _pool("BinaryDictPageBuilder") { // initially use DICT_ENCODING // TODO: the data page builder type can be created by Factory according to user config _data_page_builder.reset(new BitshufflePageBuilder(options)); diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h index 6b9d23b80ad7bf..6ee2d404404f75 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.h +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h @@ -32,7 +32,6 @@ #include "olap/rowset/segment_v2/options.h" #include "olap/types.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "olap/rowset/segment_v2/bitshuffle_page.h" namespace doris { @@ -91,7 +90,6 @@ class BinaryDictPageBuilder : public PageBuilder { // used to remember the insertion order of dict keys std::vector _dict_items; // TODO(zc): rethink about this mem pool - std::shared_ptr _tracker; MemPool _pool; faststring _buffer; faststring _first_value; diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h index 4313169aa644ea..6007b587f10da0 100644 --- a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h @@ -25,7 +25,6 @@ #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/indexed_column_reader.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" namespace doris { @@ -71,8 +70,7 @@ class BitmapIndexIterator { _dict_column_iter(reader->_dict_column_reader.get()), _bitmap_column_iter(reader->_bitmap_column_reader.get()), _current_rowid(0), - _tracker(new MemTracker()), - _pool(new MemPool(_tracker.get())) {} + _pool(new MemPool("BitmapIndexIterator")) {} bool has_null_bitmap() const { return _reader->_has_null; } @@ -109,7 +107,6 @@ class BitmapIndexIterator { IndexedColumnIterator _dict_column_iter; IndexedColumnIterator _bitmap_column_iter; rowid_t _current_rowid; - std::shared_ptr _tracker; std::unique_ptr _pool; }; diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp b/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp index a81dc92dc56e59..73b582fa745ed7 100644 --- a/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp @@ -26,7 +26,6 @@ #include "olap/rowset/segment_v2/indexed_column_writer.h" #include "olap/types.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "util/faststring.h" #include "util/slice.h" @@ -67,8 +66,7 @@ class BitmapIndexWriterImpl : public BitmapIndexWriter { explicit BitmapIndexWriterImpl(const TypeInfo* typeinfo) : _typeinfo(typeinfo), _reverted_index_size(0), - _tracker(new MemTracker()), - _pool(_tracker.get()) {} + 
_pool("BitmapIndexWriterImpl") {} ~BitmapIndexWriterImpl() = default; @@ -186,7 +184,6 @@ class BitmapIndexWriterImpl : public BitmapIndexWriter { roaring::Roaring _null_bitmap; // unique value to its row id list MemoryIndexType _mem_index; - std::shared_ptr _tracker; MemPool _pool; }; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h index bb9377eb922c74..28b952055ed5e9 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h @@ -27,7 +27,6 @@ #include "olap/rowset/segment_v2/indexed_column_reader.h" #include "olap/rowset/segment_v2/row_ranges.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" namespace doris { @@ -69,8 +68,7 @@ class BloomFilterIndexIterator { explicit BloomFilterIndexIterator(BloomFilterIndexReader* reader) : _reader(reader), _bloom_filter_iter(reader->_bloom_filter_reader.get()), - _tracker(new MemTracker()), - _pool(new MemPool(_tracker.get())) {} + _pool(new MemPool("BloomFilterIndexIterator")) {} // Read bloom filter at the given ordinal into `bf`. Status read_bloom_filter(rowid_t ordinal, std::unique_ptr* bf); @@ -80,7 +78,6 @@ class BloomFilterIndexIterator { private: BloomFilterIndexReader* _reader; IndexedColumnIterator _bloom_filter_iter; - std::shared_ptr _tracker; std::unique_ptr _pool; }; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index d45b2deb272754..3e1a204725ceae 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -28,7 +28,6 @@ #include "olap/rowset/segment_v2/indexed_column_writer.h" #include "olap/types.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "util/faststring.h" #include "util/slice.h" @@ -72,8 +71,7 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { const TypeInfo* typeinfo) : _bf_options(bf_options), _typeinfo(typeinfo), - _tracker(new MemTracker(-1, "BloomFilterIndexWriterImpl")), - _pool(_tracker.get()), + _pool("BloomFilterIndexWriterImpl"), _has_null(false), _bf_buffer_size(0) {} @@ -163,7 +161,6 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { private: BloomFilterOptions _bf_options; const TypeInfo* _typeinfo; - std::shared_ptr _tracker; MemPool _pool; bool _has_null; uint64_t _bf_buffer_size; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 9b14ff4201d4f7..85d9b77e72e579 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -457,9 +457,7 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, bool FileColumnIterator::FileColumnIterator(ColumnReader* reader) : _reader(reader) {} -FileColumnIterator::~FileColumnIterator() { - _opts.mem_tracker->Release(_opts.mem_tracker->consumption()); -} +FileColumnIterator::~FileColumnIterator() {} Status FileColumnIterator::seek_to_first() { RETURN_IF_ERROR(_reader->seek_to_first(&_page_iter)); diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index db77577788d004..dc4fa49ec014c4 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -72,8 +72,6 @@ struct ColumnIteratorOptions { // INDEX_PAGE including index_page, 
dict_page and short_key_page PageTypePB type; - std::shared_ptr mem_tracker; - void sanity_check() const { CHECK_NOTNULL(rblock); CHECK_NOTNULL(stats); @@ -198,7 +196,6 @@ class ColumnIterator { virtual ~ColumnIterator() = default; virtual Status init(const ColumnIteratorOptions& opts) { - DCHECK(opts.mem_tracker.get() != nullptr); _opts = opts; return Status::OK(); } @@ -386,8 +383,7 @@ class DefaultValueColumnIterator : public ColumnIterator { _schema_length(schema_length), _is_default_value_null(false), _type_size(0), - _tracker(new MemTracker()), - _pool(new MemPool(_tracker.get())) {} + _pool(new MemPool("DefaultValueColumnIterator")) {} Status init(const ColumnIteratorOptions& opts) override; @@ -423,7 +419,6 @@ class DefaultValueColumnIterator : public ColumnIterator { bool _is_default_value_null; size_t _type_size; void* _mem_value = nullptr; - std::shared_ptr _tracker; std::unique_ptr _pool; // current rowid diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index b98f4883ca4e09..ebd5430ad81b33 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -59,7 +59,6 @@ struct ColumnWriterOptions { << ", need_bloom_filter" << need_bloom_filter; return ss.str(); } - std::shared_ptr parent = nullptr; }; class BitmapIndexWriter; @@ -142,9 +141,6 @@ class ColumnWriter { private: std::unique_ptr _field; bool _is_nullable; - -protected: - std::shared_ptr _mem_tracker; }; class FlushPageCallback { diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp index 088de6940ed6de..16586a24c576ed 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp @@ -41,8 +41,7 @@ IndexedColumnWriter::IndexedColumnWriter(const IndexedColumnWriterOptions& optio : _options(options), _typeinfo(typeinfo), _wblock(wblock), - _mem_tracker(new MemTracker()), - _mem_pool(_mem_tracker.get()), + _mem_pool("IndexedColumnWriter"), _num_values(0), _num_data_pages(0), _value_key_coder(nullptr), diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.h b/be/src/olap/rowset/segment_v2/indexed_column_writer.h index bcb27f434351d7..691440afdcf57d 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_writer.h +++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.h @@ -27,7 +27,6 @@ #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/page_pointer.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "util/slice.h" namespace doris { @@ -91,7 +90,6 @@ class IndexedColumnWriter { const TypeInfo* _typeinfo; fs::WritableBlock* _wblock; // only used for `_first_value` - std::shared_ptr _mem_tracker; MemPool _mem_pool; ordinal_t _num_values; diff --git a/be/src/olap/rowset/segment_v2/page_io.cpp b/be/src/olap/rowset/segment_v2/page_io.cpp index 739cde1597d0cb..fe7d3c32e860a8 100644 --- a/be/src/olap/rowset/segment_v2/page_io.cpp +++ b/be/src/olap/rowset/segment_v2/page_io.cpp @@ -47,7 +47,7 @@ Status PageIO::compress_page_body(const BlockCompressionCodec* codec, double min Slice compressed_slice(buf); RETURN_IF_ERROR(codec->compress(body, &compressed_slice)); buf.resize(compressed_slice.get_size()); - + double space_saving = 1.0 - static_cast(buf.size()) / uncompressed_size; // return compressed body only when it saves more than min_space_saving if (space_saving > 0 && space_saving >= min_space_saving) { @@ -116,8 +116,10 
@@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* auto cache = StoragePageCache::instance(); PageCacheHandle cache_handle; - StoragePageCache::CacheKey cache_key(opts.rblock->path_desc().filepath, opts.page_pointer.offset); - if (opts.use_page_cache && cache->is_cache_available(opts.type) && cache->lookup(cache_key, &cache_handle, opts.type)) { + StoragePageCache::CacheKey cache_key(opts.rblock->path_desc().filepath, + opts.page_pointer.offset); + if (opts.use_page_cache && cache->is_cache_available(opts.type) && + cache->lookup(cache_key, &cache_handle, opts.type)) { // we find page in cache, use it *handle = PageHandle(std::move(cache_handle)); opts.stats->cached_pages_num++; diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 470efc0a9ec765..377c561e26d2b7 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -47,18 +47,15 @@ Status Segment::open(const FilePathDesc& path_desc, uint32_t segment_id, const T return Status::OK(); } -Segment::Segment(const FilePathDesc& path_desc, uint32_t segment_id, const TabletSchema* tablet_schema) - : _path_desc(path_desc), _segment_id(segment_id), - _tablet_schema(tablet_schema) { -#ifndef BE_TEST - _mem_tracker = MemTracker::CreateTracker(-1, "Segment", StorageEngine::instance()->tablet_mem_tracker(), false); -#else - _mem_tracker = MemTracker::CreateTracker(-1, "Segment", nullptr, false); -#endif +Segment::Segment(const FilePathDesc& path_desc, uint32_t segment_id, + const TabletSchema* tablet_schema) + : _path_desc(path_desc), _segment_id(segment_id), _tablet_schema(tablet_schema) { + _mem_tracker = MemTracker::create_virtual_tracker( + -1, "Segment", StorageEngine::instance()->tablet_mem_tracker()); } Segment::~Segment() { - _mem_tracker->Release(_mem_tracker->consumption()); + _mem_tracker->release(_mem_tracker->consumption()); } Status Segment::_open() { @@ -69,7 +66,6 @@ Status Segment::_open() { } Status Segment::new_iterator(const Schema& schema, const StorageReadOptions& read_options, - std::shared_ptr parent, std::unique_ptr* iter) { if (!_is_open) { RETURN_IF_ERROR(_open()); @@ -94,7 +90,7 @@ Status Segment::new_iterator(const Schema& schema, const StorageReadOptions& rea } RETURN_IF_ERROR(_load_index()); - iter->reset(new SegmentIterator(this->shared_from_this(), schema, parent)); + iter->reset(new SegmentIterator(this->shared_from_this(), schema)); iter->get()->init(read_options); return Status::OK(); } @@ -128,7 +124,7 @@ Status Segment::_parse_footer() { return Status::Corruption(strings::Substitute("Bad segment file $0: file size $1 < $2", _path_desc.filepath, file_size, 12 + footer_length)); } - _mem_tracker->Consume(footer_length); + _mem_tracker->consume(footer_length); std::string footer_buf; footer_buf.resize(footer_length); @@ -172,7 +168,7 @@ Status Segment::_load_index() { DCHECK_EQ(footer.type(), SHORT_KEY_PAGE); DCHECK(footer.has_short_key_page_footer()); - _mem_tracker->Consume(body.get_size()); + _mem_tracker->consume(body.get_size()); _sk_index_decoder.reset(new ShortKeyIndexDecoder); return _sk_index_decoder->parse(body, footer.short_key_page_footer()); }); @@ -202,7 +198,7 @@ Status Segment::_create_column_readers() { return Status::OK(); } -Status Segment::new_column_iterator(uint32_t cid, std::shared_ptr parent, ColumnIterator** iter) { +Status Segment::new_column_iterator(uint32_t cid, ColumnIterator** iter) { if (_column_readers[cid] == nullptr) { const TabletColumn& tablet_column 
= _tablet_schema->column(cid); if (!tablet_column.has_default_value() && !tablet_column.is_nullable()) { @@ -214,7 +210,6 @@ Status Segment::new_column_iterator(uint32_t cid, std::shared_ptr pa tablet_column.has_default_value(), tablet_column.default_value(), tablet_column.is_nullable(), type_info, tablet_column.length())); ColumnIteratorOptions iter_opts; - iter_opts.mem_tracker = MemTracker::CreateTracker(-1, "DefaultColumnIterator", parent, false); RETURN_IF_ERROR(default_value_iter->init(iter_opts)); *iter = default_value_iter.release(); diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index 56fc852d9864f6..857d6744705b96 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -64,15 +64,13 @@ class Segment : public std::enable_shared_from_this { ~Segment(); - Status new_iterator(const Schema& schema, const StorageReadOptions& read_options, - std::shared_ptr parent, - std::unique_ptr* iter); + Status new_iterator(const Schema& schema, const StorageReadOptions& read_options, std::unique_ptr* iter); uint64_t id() const { return _segment_id; } uint32_t num_rows() const { return _footer.num_rows(); } - Status new_column_iterator(uint32_t cid, std::shared_ptr parent, ColumnIterator** iter); + Status new_column_iterator(uint32_t cid, ColumnIterator** iter); Status new_bitmap_index_iterator(uint32_t cid, BitmapIndexIterator** iter); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index e5c7e0883f9d3b..891913e0725b3d 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -91,18 +91,14 @@ class SegmentIterator::BitmapRangeIterator { bool _eof = false; }; -SegmentIterator::SegmentIterator(std::shared_ptr segment, const Schema& schema, - std::shared_ptr parent) +SegmentIterator::SegmentIterator(std::shared_ptr segment, const Schema& schema) : _segment(std::move(segment)), _schema(schema), _column_iterators(_schema.num_columns(), nullptr), _bitmap_index_iterators(_schema.num_columns(), nullptr), _cur_rowid(0), _lazy_materialization_read(false), - _inited(false) { - // use for count the mem use of ColumnIterator - _mem_tracker = MemTracker::CreateTracker(-1, "SegmentIterator", std::move(parent), false); -} + _inited(false) {} SegmentIterator::~SegmentIterator() { for (auto iter : _column_iterators) { @@ -198,18 +194,16 @@ Status SegmentIterator::_prepare_seek(const StorageReadOptions::KeyRange& key_ra } } _seek_schema = std::make_unique(key_fields, key_fields.size()); - _seek_block = std::make_unique(*_seek_schema, 1, _mem_tracker); + _seek_block = std::make_unique(*_seek_schema, 1); // create used column iterator for (auto cid : _seek_schema->column_ids()) { if (_column_iterators[cid] == nullptr) { RETURN_IF_ERROR( - _segment->new_column_iterator(cid, _mem_tracker, &_column_iterators[cid])); + _segment->new_column_iterator(cid, &_column_iterators[cid])); ColumnIteratorOptions iter_opts; iter_opts.stats = _opts.stats; iter_opts.rblock = _rblock.get(); - iter_opts.mem_tracker = - MemTracker::CreateTracker(-1, "ColumnIterator", _mem_tracker, false); RETURN_IF_ERROR(_column_iterators[cid]->init(iter_opts)); } } @@ -335,13 +329,11 @@ Status SegmentIterator::_init_return_column_iterators() { for (auto cid : _schema.column_ids()) { if (_column_iterators[cid] == nullptr) { RETURN_IF_ERROR( - _segment->new_column_iterator(cid, _mem_tracker, &_column_iterators[cid])); + 
+                    _segment->new_column_iterator(cid, &_column_iterators[cid]));
             ColumnIteratorOptions iter_opts;
             iter_opts.stats = _opts.stats;
             iter_opts.use_page_cache = _opts.use_page_cache;
             iter_opts.rblock = _rblock.get();
-            iter_opts.mem_tracker =
-                    MemTracker::CreateTracker(-1, "ColumnIterator", _mem_tracker, false);
             RETURN_IF_ERROR(_column_iterators[cid]->init(iter_opts));
         }
     }
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h
index 2eae13eb3422d0..a93ef74bc5f1fd 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -47,8 +47,7 @@ class ColumnIterator;

 class SegmentIterator : public RowwiseIterator {
 public:
-    SegmentIterator(std::shared_ptr<Segment> segment, const Schema& _schema,
-                    std::shared_ptr<MemTracker> parent);
+    SegmentIterator(std::shared_ptr<Segment> segment, const Schema& _schema);
     ~SegmentIterator() override;
     Status init(const StorageReadOptions& opts) override;
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index adbfef96940a14..dd8b21985a103b 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -28,6 +28,7 @@
 #include "olap/short_key_index.h"
 #include "runtime/mem_tracker.h"
 #include "util/crc32c.h"
+#include "runtime/thread_context.h"
 #include "util/faststring.h"

 namespace doris {
@@ -37,18 +38,22 @@ const char* k_segment_magic = "D0R1";
 const uint32_t k_segment_magic_length = 4;

 SegmentWriter::SegmentWriter(fs::WritableBlock* wblock, uint32_t segment_id,
-                             const TabletSchema* tablet_schema, const SegmentWriterOptions& opts, std::shared_ptr<MemTracker> parent)
-        : _segment_id(segment_id), _tablet_schema(tablet_schema), _opts(opts), _wblock(wblock), _mem_tracker(MemTracker::CreateTracker(
-                  -1, "Segment-" + std::to_string(segment_id), parent, false)) {
+                             const TabletSchema* tablet_schema, const SegmentWriterOptions& opts)
+        : _segment_id(segment_id),
+          _tablet_schema(tablet_schema),
+          _opts(opts),
+          _wblock(wblock),
+          _mem_tracker(
+                  MemTracker::create_virtual_tracker(-1, "SegmentWriter:Segment-" + std::to_string(segment_id))) {
     CHECK_NOTNULL(_wblock);
 }

 SegmentWriter::~SegmentWriter() {
-    _mem_tracker->Release(_mem_tracker->consumption());
+    _mem_tracker->release(_mem_tracker->consumption());
 };

 void SegmentWriter::init_column_meta(ColumnMetaPB* meta, uint32_t* column_id,
-                                     const TabletColumn& column) {
+                                     const TabletColumn& column) {
     // TODO(zc): Do we need this column_id??
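    // Note: the column_id written below is just the column's sequential position within this
    // segment's footer; unique_id comes from the tablet schema.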
    meta->set_column_id((*column_id)++);
     meta->set_unique_id(column.unique_id());
@@ -85,7 +90,6 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec __attribute__((unused))
                 return Status::NotSupported("Do not support bitmap index for array type");
             }
         }
-        opts.parent = _mem_tracker;

         std::unique_ptr<ColumnWriter> writer;
         RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _wblock, &writer));
@@ -129,7 +133,7 @@ uint64_t SegmentWriter::estimate_segment_size() {
     size += _index_builder->size();

     // update the mem_tracker of segment size
-    _mem_tracker->Consume(size - _mem_tracker->consumption());
+    _mem_tracker->consume(size - _mem_tracker->consumption());
     return size;
 }
@@ -218,7 +222,7 @@ Status SegmentWriter::_write_footer() {
     // that will need an extra seek when reading
     fixed_buf.append(k_segment_magic, k_segment_magic_length);

-    std::vector<Slice> slices{footer_buf, fixed_buf};
+    std::vector<Slice> slices {footer_buf, fixed_buf};
     return _write_raw_data(slices);
 }
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h
index d0600996ad9292..77a66c85db3640 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.h
+++ b/be/src/olap/rowset/segment_v2/segment_writer.h
@@ -53,7 +53,7 @@ struct SegmentWriterOptions {
 class SegmentWriter {
 public:
     explicit SegmentWriter(fs::WritableBlock* block, uint32_t segment_id,
-                           const TabletSchema* tablet_schema, const SegmentWriterOptions& opts, std::shared_ptr<MemTracker> parent = nullptr);
+                           const TabletSchema* tablet_schema, const SegmentWriterOptions& opts);
     ~SegmentWriter();

     Status init(uint32_t write_mbytes_per_sec);
diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.cpp b/be/src/olap/rowset/segment_v2/zone_map_index.cpp
index ce29cfdb8130aa..e63df2bcc76c0e 100644
--- a/be/src/olap/rowset/segment_v2/zone_map_index.cpp
+++ b/be/src/olap/rowset/segment_v2/zone_map_index.cpp
@@ -25,14 +25,13 @@
 #include "olap/rowset/segment_v2/indexed_column_writer.h"
 #include "olap/types.h"
 #include "runtime/mem_pool.h"
-#include "runtime/mem_tracker.h"

 namespace doris {

 namespace segment_v2 {

 ZoneMapIndexWriter::ZoneMapIndexWriter(Field* field)
-        : _field(field), _tracker(new MemTracker(-1, "ZoneMapIndexWriter")), _pool(_tracker.get()) {
+        : _field(field), _pool("ZoneMapIndexWriter") {
     _page_zone_map.min_value = _field->allocate_zone_map_value(&_pool);
     _page_zone_map.max_value = _field->allocate_zone_map_value(&_pool);
     _reset_zone_map(&_page_zone_map);
@@ -129,8 +128,7 @@ Status ZoneMapIndexReader::load(bool use_page_cache, bool kept_in_memory) {
     RETURN_IF_ERROR(reader.load(use_page_cache, kept_in_memory));

     IndexedColumnIterator iter(&reader);
-    auto tracker = std::make_shared<MemTracker>(-1, "temp in ZoneMapIndexReader");
-    MemPool pool(tracker.get());
+    MemPool pool("ZoneMapIndexReader ColumnBlock");

     _page_zone_maps.resize(reader.num_values());
     // read and cache all page zone maps
diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.h b/be/src/olap/rowset/segment_v2/zone_map_index.h
index 0c129c5bd94ecd..f8ddfbb3525b03 100644
--- a/be/src/olap/rowset/segment_v2/zone_map_index.h
+++ b/be/src/olap/rowset/segment_v2/zone_map_index.h
@@ -27,7 +27,6 @@
 #include "olap/field.h"
 #include "olap/rowset/segment_v2/binary_plain_page.h"
 #include "runtime/mem_pool.h"
-#include "runtime/mem_tracker.h"
 #include "util/slice.h"

 namespace doris {
@@ -109,7 +108,6 @@ class ZoneMapIndexWriter {
     ZoneMap _segment_zone_map;
     // TODO(zc): we should replace this memory pool later, we only allocate min/max
    // for field. But MemPool allocates 4KB at least, so it will be a waste for most cases.
-    std::shared_ptr<MemTracker> _tracker;
     MemPool _pool;

     // serialized ZoneMapPB for each data page
diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp
index e3c7d8b4bf8feb..d261faff0fc650 100644
--- a/be/src/olap/schema_change.cpp
+++ b/be/src/olap/schema_change.cpp
@@ -37,6 +37,7 @@
 #include "runtime/exec_env.h"
 #include "runtime/mem_pool.h"
 #include "runtime/mem_tracker.h"
+#include "runtime/thread_context.h"
 #include "util/defer_op.h"

 using std::deque;
@@ -56,9 +57,7 @@ class RowBlockSorter {
 public:
     explicit RowBlockSorter(RowBlockAllocator* allocator);
     virtual ~RowBlockSorter();
-    size_t num_rows() {
-        return _swap_row_block != nullptr ? _swap_row_block->capacity() : 0;
-    }
+    size_t num_rows() { return _swap_row_block != nullptr ? _swap_row_block->capacity() : 0; }

     bool sort(RowBlock** row_block);
@@ -78,7 +77,7 @@ class RowBlockMerger {
     virtual ~RowBlockMerger();

     bool merge(const std::vector<RowBlock*>& row_block_arr, RowsetWriter* rowset_writer,
-               std::shared_ptr<MemTracker> parent, uint64_t* merged_rows);
+               uint64_t* merged_rows);

 private:
     struct MergeElement {
@@ -742,10 +741,9 @@ bool RowBlockSorter::sort(RowBlock** row_block) {
     return true;
 }

-RowBlockAllocator::RowBlockAllocator(const TabletSchema& tablet_schema,
-                                     std::shared_ptr<MemTracker> parent, size_t memory_limitation)
+RowBlockAllocator::RowBlockAllocator(const TabletSchema& tablet_schema, size_t memory_limitation)
         : _tablet_schema(tablet_schema),
-          _mem_tracker(MemTracker::CreateTracker(-1, "RowBlockAllocator", parent, false)),
+          _mem_tracker(MemTracker::create_virtual_tracker(-1, "RowBlockAllocator")),
           _row_len(tablet_schema.row_size()),
           _memory_limitation(memory_limitation) {
     VLOG_NOTICE << "RowBlockAllocator(). row_len=" << _row_len;
@@ -784,7 +782,7 @@ OLAPStatus RowBlockAllocator::allocate(RowBlock** row_block, size_t num_rows, bo
     row_block_info.null_supported = null_supported;
     (*row_block)->init(row_block_info);

-    _mem_tracker->Consume(row_block_size);
+    _mem_tracker->consume(row_block_size);
     VLOG_NOTICE << "RowBlockAllocator::allocate() this=" << this << ", num_rows=" << num_rows
                 << ", m_memory_allocated=" << _mem_tracker->consumption()
                 << ", row_block_addr=" << *row_block;
@@ -797,7 +795,7 @@ void RowBlockAllocator::release(RowBlock* row_block) {
         return;
     }

-    _mem_tracker->Release(row_block->capacity() * _row_len);
+    _mem_tracker->release(row_block->capacity() * _row_len);

     VLOG_NOTICE << "RowBlockAllocator::release() this=" << this
                 << ", num_rows=" << row_block->capacity()
@@ -806,7 +804,7 @@ void RowBlockAllocator::release(RowBlock* row_block) {
     delete row_block;
 }

-bool RowBlockAllocator::is_memory_enough_for_sorting(size_t num_rows, size_t allocated_rows){
+bool RowBlockAllocator::is_memory_enough_for_sorting(size_t num_rows, size_t allocated_rows) {
     if (num_rows <= allocated_rows) {
         return true;
     }
@@ -814,18 +812,15 @@ bool RowBlockAllocator::is_memory_enough_for_sorting(size_t num_rows, size_t all
     return _mem_tracker->consumption() + row_block_size < _memory_limitation;
 }

-
 RowBlockMerger::RowBlockMerger(TabletSharedPtr tablet) : _tablet(tablet) {}

 RowBlockMerger::~RowBlockMerger() {}

 bool RowBlockMerger::merge(const std::vector<RowBlock*>& row_block_arr, RowsetWriter* rowset_writer,
-                           std::shared_ptr<MemTracker> parent, uint64_t* merged_rows) {
+                           uint64_t* merged_rows) {
     uint64_t tmp_merged_rows = 0;
     RowCursor row_cursor;
-    std::shared_ptr<MemTracker> tracker(
-            MemTracker::CreateTracker(-1, "RowBlockMerger", parent, false));
-    std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
+    std::unique_ptr<MemPool>
mem_pool(new MemPool("RowBlockMerger"));
     std::unique_ptr<ObjectPool> agg_object_pool(new ObjectPool());
     if (row_cursor.init(_tablet->tablet_schema()) != OLAP_SUCCESS) {
         LOG(WARNING) << "fail to init row cursor.";
@@ -933,32 +928,31 @@ bool RowBlockMerger::_pop_heap() {
 OLAPStatus LinkedSchemaChange::process(RowsetReaderSharedPtr rowset_reader,
                                        RowsetWriter* new_rowset_writer,
                                        TabletSharedPtr new_tablet, TabletSharedPtr base_tablet) {
-
     // In some cases, there may be more than one type of rowset in a tablet,
     // in which case the conversion cannot be done directly by linked schema change,
     // but requires direct schema change to rewrite the data.
     if (rowset_reader->type() != new_rowset_writer->type()) {
-        LOG(INFO) << "the type of rowset " << rowset_reader->rowset()->rowset_id() << " in base tablet " << base_tablet->tablet_id()
-                  << " is not same as type " << new_rowset_writer->type() << ", use direct schema change.";
-        SchemaChangeDirectly scd(_row_block_changer, _mem_tracker);
+        LOG(INFO) << "the type of rowset " << rowset_reader->rowset()->rowset_id()
+                  << " in base tablet " << base_tablet->tablet_id() << " is not same as type "
+                  << new_rowset_writer->type() << ", use direct schema change.";
+        SchemaChangeDirectly scd(_row_block_changer);
         return scd.process(rowset_reader, new_rowset_writer, new_tablet, base_tablet);
     } else {
         OLAPStatus status = new_rowset_writer->add_rowset_for_linked_schema_change(
                 rowset_reader->rowset(), _row_block_changer.get_schema_mapping());
         if (status != OLAP_SUCCESS) {
             LOG(WARNING) << "fail to convert rowset."
-                         << ", new_tablet=" << new_tablet->full_name()
-                         << ", base_tablet=" << base_tablet->full_name()
-                         << ", version=" << new_rowset_writer->version().first << "-"
-                         << new_rowset_writer->version().second;
+                         << ", new_tablet=" << new_tablet->full_name()
+                         << ", base_tablet=" << base_tablet->full_name()
+                         << ", version=" << new_rowset_writer->version().first << "-"
+                         << new_rowset_writer->version().second;
         }
         return status;
     }
 }

-SchemaChangeDirectly::SchemaChangeDirectly(const RowBlockChanger& row_block_changer,
-                                           std::shared_ptr<MemTracker> mem_tracker)
-        : SchemaChange(mem_tracker),
+SchemaChangeDirectly::SchemaChangeDirectly(const RowBlockChanger& row_block_changer)
+        : SchemaChange(),
           _row_block_changer(row_block_changer),
           _row_block_allocator(nullptr),
           _cursor(nullptr) {}
@@ -1003,7 +997,7 @@ OLAPStatus SchemaChangeDirectly::process(RowsetReaderSharedPtr rowset_reader,
                                          RowsetWriter* rowset_writer, TabletSharedPtr new_tablet,
                                          TabletSharedPtr base_tablet) {
     if (_row_block_allocator == nullptr) {
-        _row_block_allocator = new RowBlockAllocator(new_tablet->tablet_schema(), _mem_tracker, 0);
+        _row_block_allocator = new RowBlockAllocator(new_tablet->tablet_schema(), 0);
         if (_row_block_allocator == nullptr) {
             LOG(FATAL) << "failed to malloc RowBlockAllocator. size=" << sizeof(RowBlockAllocator);
size=" << sizeof(RowBlockAllocator); return OLAP_ERR_INPUT_PARAMETER_ERROR; @@ -1116,9 +1110,8 @@ OLAPStatus SchemaChangeDirectly::process(RowsetReaderSharedPtr rowset_reader, } SchemaChangeWithSorting::SchemaChangeWithSorting(const RowBlockChanger& row_block_changer, - std::shared_ptr mem_tracker, size_t memory_limitation) - : SchemaChange(mem_tracker), + : SchemaChange(), _row_block_changer(row_block_changer), _memory_limitation(memory_limitation), _row_block_allocator(nullptr) { @@ -1139,8 +1132,8 @@ OLAPStatus SchemaChangeWithSorting::process(RowsetReaderSharedPtr rowset_reader, TabletSharedPtr new_tablet, TabletSharedPtr base_tablet) { if (_row_block_allocator == nullptr) { - _row_block_allocator = new (nothrow) - RowBlockAllocator(new_tablet->tablet_schema(), _mem_tracker, _memory_limitation); + _row_block_allocator = + new (nothrow) RowBlockAllocator(new_tablet->tablet_schema(), _memory_limitation); if (_row_block_allocator == nullptr) { LOG(FATAL) << "failed to malloc RowBlockAllocator. size=" << sizeof(RowBlockAllocator); return OLAP_ERR_INPUT_PARAMETER_ERROR; @@ -1175,7 +1168,7 @@ OLAPStatus SchemaChangeWithSorting::process(RowsetReaderSharedPtr rowset_reader, // src_rowsets to store the rowset generated by internal sorting std::vector src_rowsets; - Defer defer{[&]() { + Defer defer {[&]() { // remove the intermediate rowsets generated by internal sorting for (auto& row_set : src_rowsets) { StorageEngine::instance()->add_unused_rowset(row_set); @@ -1209,10 +1202,10 @@ OLAPStatus SchemaChangeWithSorting::process(RowsetReaderSharedPtr rowset_reader, LOG(WARNING) << "failed to allocate RowBlock."; return OLAP_ERR_INPUT_PARAMETER_ERROR; } else { - // do memory check for sorting, in case schema change task fail at row block sorting because of + // do memory check for sorting, in case schema change task fail at row block sorting because of // not doing internal sorting first - if (!_row_block_allocator->is_memory_enough_for_sorting(ref_row_block->row_block_info().row_num, - row_block_sorter.num_rows())) { + if (!_row_block_allocator->is_memory_enough_for_sorting( + ref_row_block->row_block_info().row_num, row_block_sorter.num_rows())) { if (new_row_block != nullptr) { _row_block_allocator->release(new_row_block); new_row_block = nullptr; @@ -1368,7 +1361,6 @@ bool SchemaChangeWithSorting::_internal_sorting(const std::vector& ro context.rowset_state = VISIBLE; context.version = version; context.segments_overlap = segments_overlap; - context.parent_mem_tracker = _mem_tracker; VLOG_NOTICE << "init rowset builder. 
tablet=" << new_tablet->full_name() << ", block_row_size=" << new_tablet->num_rows_per_row_block(); @@ -1378,7 +1370,7 @@ bool SchemaChangeWithSorting::_internal_sorting(const std::vector& ro return false; } - if (!merger.merge(row_block_arr, rowset_writer.get(), _mem_tracker, &merged_rows)) { + if (!merger.merge(row_block_arr, rowset_writer.get(), &merged_rows)) { LOG(WARNING) << "failed to merge row blocks."; new_tablet->data_dir()->remove_pending_ids(ROWSET_ID_PREFIX + rowset_writer->rowset_id().to_string()); @@ -1397,7 +1389,7 @@ bool SchemaChangeWithSorting::_external_sorting(vector& src_row std::vector rs_readers; for (auto& rowset : src_rowsets) { RowsetReaderSharedPtr rs_reader; - auto res = rowset->create_reader(_mem_tracker, &rs_reader); + auto res = rowset->create_reader(&rs_reader); if (res != OLAP_SUCCESS) { LOG(WARNING) << "failed to create rowset reader."; return false; @@ -1420,7 +1412,7 @@ bool SchemaChangeWithSorting::_external_sorting(vector& src_row } SchemaChangeHandler::SchemaChangeHandler() - : _mem_tracker(MemTracker::CreateTracker(-1, "SchemaChange", StorageEngine::instance()->schema_change_mem_tracker())) { + : _mem_tracker(MemTracker::create_tracker(-1, "SchemaChangeHandler")) { REGISTER_HOOK_METRIC(schema_change_mem_consumption, [this]() { return _mem_tracker->consumption(); }); } @@ -1430,6 +1422,7 @@ SchemaChangeHandler::~SchemaChangeHandler() { } OLAPStatus SchemaChangeHandler::process_alter_tablet_v2(const TAlterTabletReqV2& request) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); LOG(INFO) << "begin to do request alter tablet: base_tablet_id=" << request.base_tablet_id << ", base_schema_hash=" << request.base_schema_hash << ", new_tablet_id=" << request.new_tablet_id @@ -1532,9 +1525,6 @@ OLAPStatus SchemaChangeHandler::_do_process_alter_tablet_v2(const TAlterTabletRe reader_context.seek_columns = &return_columns; reader_context.sequence_id_idx = reader_context.tablet_schema->sequence_col_idx(); - auto mem_tracker = MemTracker::CreateTracker(-1, "AlterTablet:" + std::to_string(base_tablet->tablet_id()) + "-" - + std::to_string(new_tablet->tablet_id()), _mem_tracker, true, false, MemTrackerLevel::TASK); - do { // get history data to be converted and it will check if there is hold in base tablet res = _get_versions_to_be_changed(base_tablet, &versions_to_be_changed); @@ -1596,7 +1586,7 @@ OLAPStatus SchemaChangeHandler::_do_process_alter_tablet_v2(const TAlterTabletRe } // acquire data sources correspond to history versions - base_tablet->capture_rs_readers(versions_to_be_changed, &rs_readers, mem_tracker); + base_tablet->capture_rs_readers(versions_to_be_changed, &rs_readers); if (rs_readers.size() < 1) { LOG(WARNING) << "fail to acquire all data sources. " << "version_num=" << versions_to_be_changed.size() @@ -1694,6 +1684,7 @@ OLAPStatus SchemaChangeHandler::schema_version_convert(TabletSharedPtr base_tabl TabletSharedPtr new_tablet, RowsetSharedPtr* base_rowset, RowsetSharedPtr* new_rowset) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); OLAPStatus res = OLAP_SUCCESS; LOG(INFO) << "begin to convert delta version for schema changing. 
" << "base_tablet=" << base_tablet->full_name() @@ -1720,14 +1711,14 @@ OLAPStatus SchemaChangeHandler::schema_version_convert(TabletSharedPtr base_tabl size_t memory_limitation = config::memory_limitation_per_thread_for_schema_change; LOG(INFO) << "doing schema change with sorting for base_tablet " << base_tablet->full_name(); - sc_procedure = new (nothrow) SchemaChangeWithSorting( - rb_changer, _mem_tracker, memory_limitation * 1024 * 1024 * 1024); + sc_procedure = new (nothrow) + SchemaChangeWithSorting(rb_changer, memory_limitation * 1024 * 1024 * 1024); } else if (sc_directly) { LOG(INFO) << "doing schema change directly for base_tablet " << base_tablet->full_name(); - sc_procedure = new (nothrow) SchemaChangeDirectly(rb_changer, _mem_tracker); + sc_procedure = new (nothrow) SchemaChangeDirectly(rb_changer); } else { LOG(INFO) << "doing linked schema change for base_tablet " << base_tablet->full_name(); - sc_procedure = new (nothrow) LinkedSchemaChange(rb_changer, _mem_tracker); + sc_procedure = new (nothrow) LinkedSchemaChange(rb_changer); } if (sc_procedure == nullptr) { @@ -1754,7 +1745,7 @@ OLAPStatus SchemaChangeHandler::schema_version_convert(TabletSharedPtr base_tabl reader_context.sequence_id_idx = reader_context.tablet_schema->sequence_col_idx(); RowsetReaderSharedPtr rowset_reader; - RETURN_NOT_OK((*base_rowset)->create_reader(_mem_tracker, &rowset_reader)); + RETURN_NOT_OK((*base_rowset)->create_reader(&rowset_reader)); RETURN_NOT_OK(rowset_reader->init(&reader_context)); RowsetWriterContext writer_context; @@ -1774,7 +1765,6 @@ OLAPStatus SchemaChangeHandler::schema_version_convert(TabletSharedPtr base_tabl writer_context.load_id.set_hi((*base_rowset)->load_id().hi()); writer_context.load_id.set_lo((*base_rowset)->load_id().lo()); writer_context.segments_overlap = (*base_rowset)->rowset_meta()->segments_overlap(); - writer_context.parent_mem_tracker = _mem_tracker; std::unique_ptr rowset_writer; RowsetFactory::create_rowset_writer(writer_context, &rowset_writer); @@ -1872,16 +1862,16 @@ OLAPStatus SchemaChangeHandler::_convert_historical_rowsets(const SchemaChangePa size_t memory_limitation = config::memory_limitation_per_thread_for_schema_change; LOG(INFO) << "doing schema change with sorting for base_tablet " << sc_params.base_tablet->full_name(); - sc_procedure = new (nothrow) SchemaChangeWithSorting( - rb_changer, _mem_tracker, memory_limitation * 1024 * 1024 * 1024); + sc_procedure = new (nothrow) + SchemaChangeWithSorting(rb_changer, memory_limitation * 1024 * 1024 * 1024); } else if (sc_directly) { LOG(INFO) << "doing schema change directly for base_tablet " << sc_params.base_tablet->full_name(); - sc_procedure = new (nothrow) SchemaChangeDirectly(rb_changer, _mem_tracker); + sc_procedure = new (nothrow) SchemaChangeDirectly(rb_changer); } else { LOG(INFO) << "doing linked schema change for base_tablet " << sc_params.base_tablet->full_name(); - sc_procedure = new (nothrow) LinkedSchemaChange(rb_changer, _mem_tracker); + sc_procedure = new (nothrow) LinkedSchemaChange(rb_changer); } if (sc_procedure == nullptr) { @@ -1919,7 +1909,6 @@ OLAPStatus SchemaChangeHandler::_convert_historical_rowsets(const SchemaChangePa writer_context.rowset_state = VISIBLE; writer_context.version = rs_reader->version(); writer_context.segments_overlap = rs_reader->rowset()->rowset_meta()->segments_overlap(); - writer_context.parent_mem_tracker = _mem_tracker; std::unique_ptr rowset_writer; OLAPStatus status = RowsetFactory::create_rowset_writer(writer_context, &rowset_writer); diff --git 
diff --git a/be/src/olap/schema_change.h b/be/src/olap/schema_change.h
index 53aa34934a5aec..4312e2cc91b4c5 100644
--- a/be/src/olap/schema_change.h
+++ b/be/src/olap/schema_change.h
@@ -76,7 +76,7 @@ class RowBlockChanger {

 class RowBlockAllocator {
 public:
-    RowBlockAllocator(const TabletSchema& tablet_schema, std::shared_ptr<MemTracker> parent, size_t memory_limitation);
+    RowBlockAllocator(const TabletSchema& tablet_schema, size_t memory_limitation);
     virtual ~RowBlockAllocator();

     OLAPStatus allocate(RowBlock** row_block, size_t num_rows, bool null_supported);
@@ -93,7 +93,7 @@ class RowBlockAllocator {

 class SchemaChange {
 public:
-    SchemaChange(std::shared_ptr<MemTracker> tracker) : _mem_tracker(std::move(tracker)), _filtered_rows(0), _merged_rows(0) {}
+    SchemaChange() : _filtered_rows(0), _merged_rows(0) {}
     virtual ~SchemaChange() = default;

     virtual OLAPStatus process(RowsetReaderSharedPtr rowset_reader,
@@ -111,8 +111,7 @@ class SchemaChange {
     void reset_filtered_rows() { _filtered_rows = 0; }
     void reset_merged_rows() { _merged_rows = 0; }

-protected:
-    std::shared_ptr<MemTracker> _mem_tracker;
+
 private:
     uint64_t _filtered_rows;
     uint64_t _merged_rows;
@@ -120,8 +119,8 @@ class SchemaChange {

 class LinkedSchemaChange : public SchemaChange {
 public:
-    explicit LinkedSchemaChange(const RowBlockChanger& row_block_changer, std::shared_ptr<MemTracker> mem_tracker)
-            : SchemaChange(mem_tracker), _row_block_changer(row_block_changer) {}
+    explicit LinkedSchemaChange(const RowBlockChanger& row_block_changer)
+            : SchemaChange(), _row_block_changer(row_block_changer) {}
     ~LinkedSchemaChange() {}

     virtual OLAPStatus process(RowsetReaderSharedPtr rowset_reader, RowsetWriter* new_rowset_writer,
@@ -137,7 +136,7 @@ class SchemaChangeDirectly : public SchemaChange {
 public:
     // @params tablet           the instance of tablet which has new schema.
    // @params row_block_changer    changer to modify the data of RowBlock
-    explicit SchemaChangeDirectly(const RowBlockChanger& row_block_changer, std::shared_ptr<MemTracker> mem_tracker);
+    explicit SchemaChangeDirectly(const RowBlockChanger& row_block_changer);
     virtual ~SchemaChangeDirectly();

     virtual OLAPStatus process(RowsetReaderSharedPtr rowset_reader, RowsetWriter* new_rowset_writer,
@@ -156,7 +155,7 @@ class SchemaChangeDirectly : public SchemaChange {
 // @brief schema change with sorting
 class SchemaChangeWithSorting : public SchemaChange {
 public:
-    explicit SchemaChangeWithSorting(const RowBlockChanger& row_block_changer, std::shared_ptr<MemTracker> mem_tracker,
+    explicit SchemaChangeWithSorting(const RowBlockChanger& row_block_changer,
                                      size_t memory_limitation);
     virtual ~SchemaChangeWithSorting();
@@ -237,6 +236,7 @@ class SchemaChangeHandler {
     static OLAPStatus _init_column_mapping(ColumnMapping* column_mapping,
                                            const TabletColumn& column_schema,
                                            const std::string& value);
+
 private:
     SchemaChangeHandler();
     virtual ~SchemaChangeHandler();
diff --git a/be/src/olap/segment_loader.cpp b/be/src/olap/segment_loader.cpp
index 198b4b41543c34..2c3a95689b69bb 100644
--- a/be/src/olap/segment_loader.cpp
+++ b/be/src/olap/segment_loader.cpp
@@ -30,11 +30,9 @@ void SegmentLoader::create_global_instance(size_t capacity) {
     _s_instance = &instance;
 }

-SegmentLoader::SegmentLoader(size_t capacity)
-        : _mem_tracker(MemTracker::CreateTracker(capacity, "SegmentLoader", nullptr, true, true,
-                                                 MemTrackerLevel::OVERVIEW)) {
+SegmentLoader::SegmentLoader(size_t capacity) {
     _cache = std::unique_ptr<Cache>(
-            new_typed_lru_cache("SegmentCache", capacity, LRUCacheType::NUMBER, _mem_tracker));
+            new_typed_lru_cache("SegmentLoader:SegmentCache", capacity, LRUCacheType::NUMBER));
 }

 bool SegmentLoader::_lookup(const SegmentLoader::CacheKey& key, SegmentCacheHandle* handle) {
diff --git a/be/src/olap/segment_loader.h b/be/src/olap/segment_loader.h
index 2a75efa544c2e2..30cfce304d5d28 100644
--- a/be/src/olap/segment_loader.h
+++ b/be/src/olap/segment_loader.h
@@ -25,7 +25,6 @@
 #include "olap/lru_cache.h"
 #include "olap/olap_common.h" // for rowset id
 #include "olap/rowset/beta_rowset.h"
-#include "runtime/mem_tracker.h"
 #include "util/time.h"

 namespace doris {
@@ -107,7 +106,6 @@ class SegmentLoader {
     static SegmentLoader* _s_instance;
     // An LRU cache to cache all opened segments
     std::unique_ptr<Cache> _cache = nullptr;
-    std::shared_ptr<MemTracker> _mem_tracker = nullptr;
 };

 // A handle for a single rowset from segment lru cache.
diff --git a/be/src/olap/snapshot_manager.cpp b/be/src/olap/snapshot_manager.cpp
index 80816d71c7be2e..4baa4a47910004 100644
--- a/be/src/olap/snapshot_manager.cpp
+++ b/be/src/olap/snapshot_manager.cpp
@@ -34,6 +34,7 @@
 #include "olap/rowset/rowset_converter.h"
 #include "olap/rowset/rowset_factory.h"
 #include "olap/rowset/rowset_id_generator.h"
+#include "runtime/thread_context.h"
 #include "olap/rowset/rowset_writer.h"
 #include "olap/storage_engine.h"
@@ -63,6 +64,7 @@ SnapshotManager* SnapshotManager::instance() {
 OLAPStatus SnapshotManager::make_snapshot(const TSnapshotRequest& request, string* snapshot_path,
                                           bool* allow_incremental_clone) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     OLAPStatus res = OLAP_SUCCESS;
     if (snapshot_path == nullptr) {
         LOG(WARNING) << "output parameter cannot be null";
@@ -92,6 +94,7 @@ OLAPStatus SnapshotManager::make_snapshot(const TSnapshotRequest& request, strin
 OLAPStatus SnapshotManager::release_snapshot(const string& snapshot_path) {
     // If the requested snapshot_path is under the root/snapshot directory, it is considered valid and can be deleted.
    // Otherwise the request is considered invalid and an error is returned.
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     auto stores = StorageEngine::instance()->get_stores();
     for (auto store : stores) {
         if (store->is_remote()) {
@@ -120,6 +123,7 @@ OLAPStatus SnapshotManager::release_snapshot(const string& snapshot_path) {
 // AlphaRowsetMeta here.
 OLAPStatus SnapshotManager::convert_rowset_ids(const FilePathDesc& clone_dir_desc, int64_t tablet_id,
                                                const int32_t& schema_hash) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     OLAPStatus res = OLAP_SUCCESS;
     // check whether the clone dir exists
     if (!FileUtils::check_exist(clone_dir_desc.filepath)) {
diff --git a/be/src/olap/snapshot_manager.h b/be/src/olap/snapshot_manager.h
index 0efa64711befff..c13a133ab6a549 100644
--- a/be/src/olap/snapshot_manager.h
+++ b/be/src/olap/snapshot_manager.h
@@ -65,7 +65,10 @@ class SnapshotManager {
                                     const int32_t& schema_hash);

 private:
-    SnapshotManager() : _snapshot_base_id(0) {}
+    SnapshotManager() : _snapshot_base_id(0) {
+        _mem_tracker = MemTracker::create_tracker(-1, "SnapshotManager", nullptr,
+                                                  MemTrackerLevel::OVERVIEW);
+    }

     OLAPStatus _calc_snapshot_id_path(const TabletSharedPtr& tablet, int64_t timeout_s,
                                       std::string* out_path);
@@ -99,6 +102,8 @@ class SnapshotManager {
     // snapshot
     Mutex _snapshot_mutex;
     uint64_t _snapshot_base_id;
+
+    std::shared_ptr<MemTracker> _mem_tracker = nullptr;
 }; // SnapshotManager

 } // namespace doris
diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp
index 201bffb6649bb2..5c5c65e24fd4f4 100644
--- a/be/src/olap/storage_engine.cpp
+++ b/be/src/olap/storage_engine.cpp
@@ -112,10 +112,18 @@ StorageEngine::StorageEngine(const EngineOptions& options)
           _is_all_cluster_id_exist(true),
           _index_stream_lru_cache(nullptr),
           _file_cache(nullptr),
-          _compaction_mem_tracker(MemTracker::CreateTracker(-1, "AutoCompaction", nullptr, false,
-                                                            false, MemTrackerLevel::OVERVIEW)),
-          _tablet_mem_tracker(MemTracker::CreateTracker(-1, "TabletHeader", nullptr, false, false,
+          _compaction_mem_tracker(MemTracker::create_tracker(-1, "StorageEngine::AutoCompaction",
+                                                             nullptr, MemTrackerLevel::OVERVIEW)),
+          _tablet_mem_tracker(MemTracker::create_virtual_tracker(
+                  -1, "StorageEngine::TabletHeader", nullptr, MemTrackerLevel::OVERVIEW)),
+          _schema_change_mem_tracker(MemTracker::create_tracker(
+                  -1, "StorageEngine::SchemaChange", nullptr, MemTrackerLevel::OVERVIEW)),
+          _clone_mem_tracker(MemTracker::create_tracker(-1, "StorageEngine::Clone", nullptr, MemTrackerLevel::OVERVIEW)),
+          _batch_load_mem_tracker(MemTracker::create_tracker(-1, "StorageEngine::BatchLoad",
+                                                             nullptr, MemTrackerLevel::OVERVIEW)),
+          _consistency_mem_tracker(MemTracker::create_tracker(-1, "StorageEngine::Consistency",
+                                                              nullptr, MemTrackerLevel::OVERVIEW)),
           _stop_background_threads_latch(1),
           _tablet_manager(new TabletManager(config::tablet_map_shard_size)),
           _txn_manager(new TxnManager(config::txn_map_shard_size, config::txn_shard_size)),
@@ -134,7 +142,7 @@ StorageEngine::StorageEngine(const EngineOptions& options)
     REGISTER_HOOK_METRIC(compaction_mem_consumption, [this]() {
         return _compaction_mem_tracker->consumption();
         // We can get each compaction's detail usage
-        // LOG(INFO) << _compaction_mem_tracker->LogUsage(2);
+        // LOG(INFO) << _compaction_mem_tracker->log_usage(2);
     });
 }
@@ -1063,17 +1071,12 @@ bool StorageEngine::check_rowset_id_in_unused_rowsets(const RowsetId& rowset_id)

 void StorageEngine::create_cumulative_compaction(
         TabletSharedPtr best_tablet, std::shared_ptr<CumulativeCompaction>& cumulative_compaction) {
-    std::string tracker_label =
-            "StorageEngine:CumulativeCompaction:" + std::to_string(best_tablet->tablet_id());
-    cumulative_compaction.reset(
-            new CumulativeCompaction(best_tablet, tracker_label, _compaction_mem_tracker));
+    cumulative_compaction.reset(new CumulativeCompaction(best_tablet));
 }

 void StorageEngine::create_base_compaction(TabletSharedPtr best_tablet,
                                            std::shared_ptr<BaseCompaction>& base_compaction) {
-    std::string tracker_label =
-            "StorageEngine:BaseCompaction:" + std::to_string(best_tablet->tablet_id());
-    base_compaction.reset(new BaseCompaction(best_tablet, tracker_label, _compaction_mem_tracker));
+    base_compaction.reset(new BaseCompaction(best_tablet));
 }

 // Return json:
diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h
index a07d3a98bed18e..83ac78d13c5fee 100644
--- a/be/src/olap/storage_engine.h
+++ b/be/src/olap/storage_engine.h
@@ -185,8 +185,12 @@ class StorageEngine {
     Status get_compaction_status_json(std::string* result);

+    std::shared_ptr<MemTracker> compaction_mem_tracker() { return _compaction_mem_tracker; }
     std::shared_ptr<MemTracker> tablet_mem_tracker() { return _tablet_mem_tracker; }
     std::shared_ptr<MemTracker> schema_change_mem_tracker() { return _schema_change_mem_tracker; }
+    std::shared_ptr<MemTracker> clone_mem_tracker() { return _clone_mem_tracker; }
+    std::shared_ptr<MemTracker> batch_load_mem_tracker() { return _batch_load_mem_tracker; }
+    std::shared_ptr<MemTracker> consistency_mem_tracker() { return _consistency_mem_tracker; }

     // check cumulative compaction config
     void check_cumulative_compaction_config();
@@ -326,6 +330,9 @@ class StorageEngine {
     std::shared_ptr<MemTracker> _compaction_mem_tracker;
     std::shared_ptr<MemTracker> _tablet_mem_tracker;
     std::shared_ptr<MemTracker> _schema_change_mem_tracker;
+    std::shared_ptr<MemTracker> _clone_mem_tracker;
+    std::shared_ptr<MemTracker> _batch_load_mem_tracker;
+    std::shared_ptr<MemTracker> _consistency_mem_tracker;

     CountDownLatch _stop_background_threads_latch;
     scoped_refptr<Thread> _unused_rowset_monitor_thread;
diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index b96290db74c2fb..910c23bbdb6503 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -639,17 +639,15 @@ OLAPStatus Tablet::_capture_consistent_rowsets_unlocked(
 }

 OLAPStatus Tablet::capture_rs_readers(const Version& spec_version,
-                                      std::vector<RowsetReaderSharedPtr>* rs_readers,
-                                      std::shared_ptr<MemTracker> parent_tracker) const {
+                                      std::vector<RowsetReaderSharedPtr>* rs_readers) const {
     std::vector<Version> version_path;
     RETURN_NOT_OK(capture_consistent_versions(spec_version, &version_path));
-    RETURN_NOT_OK(capture_rs_readers(version_path, rs_readers, parent_tracker));
+    RETURN_NOT_OK(capture_rs_readers(version_path, rs_readers));
    return OLAP_SUCCESS;
 }

 OLAPStatus Tablet::capture_rs_readers(const std::vector<Version>& version_path,
-                                      std::vector<RowsetReaderSharedPtr>* rs_readers,
-                                      std::shared_ptr<MemTracker> parent_tracker) const {
+                                      std::vector<RowsetReaderSharedPtr>* rs_readers) const {
     DCHECK(rs_readers != nullptr && rs_readers->empty());
     for (auto version : version_path) {
         auto it = _rs_version_map.find(version);
@@ -666,7 +664,7 @@ OLAPStatus Tablet::capture_rs_readers(const std::vector<Version>& version_path,
             }
         }
         RowsetReaderSharedPtr rs_reader;
-        auto res = it->second->create_reader(parent_tracker, &rs_reader);
+        auto res = it->second->create_reader(&rs_reader);
         if (res != OLAP_SUCCESS) {
             LOG(WARNING) << "failed to create reader for rowset:" << it->second->rowset_id();
             return OLAP_ERR_CAPTURE_ROWSET_READER_ERROR;
diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index 03a86f820a56ec..f8ab386acf96fd 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -124,12 +124,10 @@ class Tablet : public BaseTablet {
     OLAPStatus capture_consistent_rowsets(const Version& spec_version,
                                           std::vector<RowsetSharedPtr>* rowsets) const;
     OLAPStatus capture_rs_readers(const Version& spec_version,
-                                  std::vector<RowsetReaderSharedPtr>* rs_readers,
-                                  std::shared_ptr<MemTracker> parent_tracker = nullptr) const;
+                                  std::vector<RowsetReaderSharedPtr>* rs_readers) const;

     OLAPStatus capture_rs_readers(const std::vector<Version>& version_path,
-                                  std::vector<RowsetReaderSharedPtr>* rs_readers,
-                                  std::shared_ptr<MemTracker> parent_tracker = nullptr) const;
+                                  std::vector<RowsetReaderSharedPtr>* rs_readers) const;

     DelPredicateArray delete_predicates() { return _tablet_meta->delete_predicates(); }
     void add_delete_predicate(const DeletePredicatePB& delete_predicate, int64_t version);
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 2e7cd8a8a45c52..75a6a298fda9d6 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -49,6 +49,7 @@
 #include "service/backend_options.h"
 #include "util/doris_metrics.h"
 #include "util/file_utils.h"
+#include "runtime/thread_context.h"
 #include "util/histogram.h"
 #include "util/path_util.h"
 #include "util/pretty_printer.h"
@@ -73,8 +74,8 @@ static bool _cmp_tablet_by_create_time(const TabletSharedPtr& a, const TabletSha
 }

 TabletManager::TabletManager(int32_t tablet_map_lock_shard_size)
-        : _mem_tracker(MemTracker::CreateTracker(-1, "TabletMeta", nullptr, false, false,
-                                                 MemTrackerLevel::OVERVIEW)),
+        : _mem_tracker(MemTracker::create_virtual_tracker(-1, "TabletManager", nullptr,
+                                                          MemTrackerLevel::OVERVIEW)),
           _tablets_shards_size(tablet_map_lock_shard_size),
           _tablets_shards_mask(tablet_map_lock_shard_size - 1),
           _last_update_stat_ms(0) {
@@ -89,7 +90,6 @@ TabletManager::TabletManager(int32_t tablet_map_lock_shard_size)
 }

 TabletManager::~TabletManager() {
-    _mem_tracker->Release(_mem_tracker->consumption());
     DEREGISTER_HOOK_METRIC(tablet_meta_mem_consumption);
 }
@@ -204,7 +204,7 @@ OLAPStatus TabletManager::_add_tablet_to_map_unlocked(TTabletId tablet_id, Schem
     // TODO: remove the multiply-by-2 of the tablet meta mem size
     // Because the table schema is copied in the tablet, the memory cost is doubled,
     // so multiply by 2 here
-    _mem_tracker->Consume(tablet->tablet_meta()->mem_size() * 2);
+    _mem_tracker->consume(tablet->tablet_meta()->mem_size() * 2);

     VLOG_NOTICE << "add tablet to map successfully."
<< " tablet_id=" << tablet_id << ", schema_hash=" << schema_hash; @@ -1368,7 +1368,7 @@ OLAPStatus TabletManager::_drop_tablet_directly_unlocked(TTabletId tablet_id, } dropped_tablet->deregister_tablet_from_dir(); - _mem_tracker->Release(dropped_tablet->tablet_meta()->mem_size() * 2); + _mem_tracker->release(dropped_tablet->tablet_meta()->mem_size() * 2); return OLAP_SUCCESS; } diff --git a/be/src/olap/task/engine_alter_tablet_task.cpp b/be/src/olap/task/engine_alter_tablet_task.cpp index 51f029648a20d5..32ee5852510de6 100644 --- a/be/src/olap/task/engine_alter_tablet_task.cpp +++ b/be/src/olap/task/engine_alter_tablet_task.cpp @@ -18,6 +18,7 @@ #include "olap/task/engine_alter_tablet_task.h" #include "olap/schema_change.h" +#include "runtime/thread_context.h" namespace doris { @@ -31,9 +32,16 @@ EngineAlterTabletTask::EngineAlterTabletTask(const TAlterTabletReqV2& request, i _signature(signature), _task_type(task_type), _error_msgs(error_msgs), - _process_name(process_name) {} + _process_name(process_name) { + _mem_tracker = MemTracker::create_tracker( + config::memory_limitation_per_thread_for_schema_change * 1024 * 1024 * 1024, + fmt::format("{}: {}-{}", process_name, std::to_string(_alter_tablet_req.base_tablet_id), + std::to_string(_alter_tablet_req.new_tablet_id)), + StorageEngine::instance()->schema_change_mem_tracker(), MemTrackerLevel::TASK); +} OLAPStatus EngineAlterTabletTask::execute() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DorisMetrics::instance()->create_rollup_requests_total->increment(1); auto schema_change_handler = SchemaChangeHandler::instance(); diff --git a/be/src/olap/task/engine_alter_tablet_task.h b/be/src/olap/task/engine_alter_tablet_task.h index 73dd5514ac9da8..d02b3ac286a293 100644 --- a/be/src/olap/task/engine_alter_tablet_task.h +++ b/be/src/olap/task/engine_alter_tablet_task.h @@ -43,6 +43,7 @@ class EngineAlterTabletTask : public EngineTask { vector* _error_msgs; const string& _process_name; + std::shared_ptr _mem_tracker; }; // EngineTask } // namespace doris diff --git a/be/src/olap/task/engine_batch_load_task.cpp b/be/src/olap/task/engine_batch_load_task.cpp index a2750155415795..d73256df5b647f 100644 --- a/be/src/olap/task/engine_batch_load_task.cpp +++ b/be/src/olap/task/engine_batch_load_task.cpp @@ -35,6 +35,7 @@ #include "olap/push_handler.h" #include "olap/storage_engine.h" #include "olap/tablet.h" +#include "runtime/thread_context.h" #include "util/doris_metrics.h" #include "util/pretty_printer.h" @@ -52,11 +53,15 @@ EngineBatchLoadTask::EngineBatchLoadTask(TPushReq& push_req, std::vectorbatch_load_mem_tracker(), MemTrackerLevel::TASK); } EngineBatchLoadTask::~EngineBatchLoadTask() {} OLAPStatus EngineBatchLoadTask::execute() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); AgentStatus status = DORIS_SUCCESS; if (_push_req.push_type == TPushType::LOAD || _push_req.push_type == TPushType::LOAD_DELETE || _push_req.push_type == TPushType::LOAD_V2) { diff --git a/be/src/olap/task/engine_batch_load_task.h b/be/src/olap/task/engine_batch_load_task.h index 125dc7fc149eb6..3e9d92c71a6891 100644 --- a/be/src/olap/task/engine_batch_load_task.h +++ b/be/src/olap/task/engine_batch_load_task.h @@ -77,6 +77,7 @@ class EngineBatchLoadTask : public EngineTask { AgentStatus* _res_status; std::string _remote_file_path; std::string _local_file_path; + std::shared_ptr _mem_tracker; }; // class Pusher } // namespace doris #endif // DORIS_BE_SRC_OLAP_TASK_ENGINE_BATCH_LOAD_TASK_H diff --git 
diff --git a/be/src/olap/task/engine_checksum_task.cpp b/be/src/olap/task/engine_checksum_task.cpp
index b795efb834ab27..be44f0ac65a4a2 100644
--- a/be/src/olap/task/engine_checksum_task.cpp
+++ b/be/src/olap/task/engine_checksum_task.cpp
@@ -17,8 +17,9 @@

 #include "olap/task/engine_checksum_task.h"

-#include "olap/tuple_reader.h"
 #include "olap/row.h"
+#include "olap/tuple_reader.h"
+#include "runtime/thread_context.h"

 namespace doris {
@@ -27,9 +28,14 @@ EngineChecksumTask::EngineChecksumTask(TTabletId tablet_id, TSchemaHash schema_h
         : _tablet_id(tablet_id),
           _schema_hash(schema_hash),
           _version(version),
-          _checksum(checksum) {}
+          _checksum(checksum) {
+    _mem_tracker = MemTracker::create_tracker(-1, "compute checksum: " + std::to_string(tablet_id),
+                                              StorageEngine::instance()->consistency_mem_tracker(),
+                                              MemTrackerLevel::TASK);
+}

 OLAPStatus EngineChecksumTask::execute() {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     OLAPStatus res = _compute_checksum();
     return res;
 } // execute
@@ -87,8 +93,7 @@ OLAPStatus EngineChecksumTask::_compute_checksum() {
     }

     RowCursor row;
-    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
-    std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
+    std::unique_ptr<MemPool> mem_pool(new MemPool("EngineChecksumTask:_compute_checksum"));
     std::unique_ptr<ObjectPool> agg_object_pool(new ObjectPool());
     res = row.init(tablet->tablet_schema(), reader_params.return_columns);
     if (res != OLAP_SUCCESS) {
diff --git a/be/src/olap/task/engine_checksum_task.h b/be/src/olap/task/engine_checksum_task.h
index 7f0cdb6e6bdcc8..0430c560e99101 100644
--- a/be/src/olap/task/engine_checksum_task.h
+++ b/be/src/olap/task/engine_checksum_task.h
@@ -44,6 +44,7 @@ class EngineChecksumTask : public EngineTask {
     TSchemaHash _schema_hash;
     TVersion _version;
     uint32_t* _checksum;
+    std::shared_ptr<MemTracker> _mem_tracker;
 }; // EngineTask

 } // namespace doris
diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp
index 71e73a3b642ab9..0ae6c7fb498367 100644
--- a/be/src/olap/task/engine_clone_task.cpp
+++ b/be/src/olap/task/engine_clone_task.cpp
@@ -30,6 +30,7 @@
 #include "olap/rowset/rowset_factory.h"
 #include "olap/snapshot_manager.h"
 #include "runtime/client_cache.h"
+#include "runtime/thread_context.h"
 #include "util/thrift_rpc_helper.h"

 using std::set;
@@ -55,9 +56,14 @@ EngineCloneTask::EngineCloneTask(const TCloneReq& clone_req, const TMasterInfo&
           _tablet_infos(tablet_infos),
           _res_status(res_status),
           _signature(signature),
-          _master_info(master_info) {}
+          _master_info(master_info) {
+    _mem_tracker = MemTracker::create_tracker(
+            -1, "clone tablet: " + std::to_string(_clone_req.tablet_id),
+            StorageEngine::instance()->clone_mem_tracker(), MemTrackerLevel::TASK);
+}

 OLAPStatus EngineCloneTask::execute() {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     // register the tablet to avoid it being deleted by the gc thread during the clone process
     StorageEngine::instance()->tablet_manager()->register_clone_tablet(_clone_req.tablet_id);
     OLAPStatus st = _do_clone();
@@ -758,9 +764,9 @@ OLAPStatus EngineCloneTask::_finish_full_clone(Tablet* tablet, TabletMeta* clone
     // but some rowsets are useless, so remove them here
     for (auto& rs_meta_ptr : rs_metas_found_in_src) {
         RowsetSharedPtr rowset_to_remove;
-        auto s =
-                RowsetFactory::create_rowset(&(cloned_tablet_meta->tablet_schema()),
-                                             tablet->tablet_path_desc().filepath, rs_meta_ptr, &rowset_to_remove);
+        auto s = RowsetFactory::create_rowset(&(cloned_tablet_meta->tablet_schema()),
+                                              tablet->tablet_path_desc().filepath, rs_meta_ptr,
+                                              &rowset_to_remove);
        if (s != OLAP_SUCCESS) {
             LOG(WARNING) << "failed to init rowset to remove: "
                          << rs_meta_ptr->rowset_id().to_string();
diff --git a/be/src/olap/task/engine_clone_task.h b/be/src/olap/task/engine_clone_task.h
index dbe3c1fef81b63..508e5e755dd0bd 100644
--- a/be/src/olap/task/engine_clone_task.h
+++ b/be/src/olap/task/engine_clone_task.h
@@ -76,6 +76,7 @@ class EngineCloneTask : public EngineTask {
     const TMasterInfo& _master_info;
     int64_t _copy_size;
     int64_t _copy_time_ms;
+    std::shared_ptr<MemTracker> _mem_tracker;
 }; // EngineTask

 } // namespace doris
diff --git a/be/src/olap/tuple_reader.cpp b/be/src/olap/tuple_reader.cpp
index 5c15c2b42f9741..93ba2513d9866a 100644
--- a/be/src/olap/tuple_reader.cpp
+++ b/be/src/olap/tuple_reader.cpp
@@ -30,7 +30,6 @@
 #include "olap/schema.h"
 #include "olap/storage_engine.h"
 #include "runtime/mem_pool.h"
-#include "runtime/mem_tracker.h"
 #include "util/date_func.h"

 using std::nothrow;
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index 142934d5cbf602..a1ca44b15e51d2 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -46,7 +46,10 @@ set(RUNTIME_FILES
     runtime_state.cpp
     runtime_filter_mgr.cpp
     string_value.cpp
+    thread_context.cpp
+    thread_mem_tracker_mgr.cpp
     thread_resource_mgr.cpp
+    threadlocal.cc
     decimalv2_value.cpp
     large_int_value.cpp
     collection_value.cpp
@@ -67,6 +70,7 @@ set(RUNTIME_FILES
     disk_io_mgr_scan_range.cc
     buffered_block_mgr2.cc
     mem_tracker.cpp
+    mem_tracker_task_pool.cpp
     spill_sorter.cc
     sorted_run_merger.cc
     data_stream_recvr.cc
diff --git a/be/src/runtime/buffered_block_mgr2.cc b/be/src/runtime/buffered_block_mgr2.cc
index 92edcdcabe5c3e..09de2b4a1b2089 100644
--- a/be/src/runtime/buffered_block_mgr2.cc
+++ b/be/src/runtime/buffered_block_mgr2.cc
@@ -22,6 +22,7 @@
 #include "runtime/mem_pool.h"
 #include "runtime/mem_tracker.h"
 #include "runtime/runtime_state.h"
+#include "runtime/thread_context.h"
 #include "runtime/tmp_file_mgr.h"
 #include "util/bit_util.h"
 #include "util/debug_util.h"
@@ -100,8 +101,7 @@ class BufferedBlockMgr2::Client {
         DCHECK(buffer != nullptr);
         if (buffer->len == _mgr->max_block_size()) {
             ++_num_pinned_buffers;
-            _tracker->ConsumeLocal(buffer->len, _query_tracker.get());
-            // _tracker->Consume(buffer->len);
+            _tracker->consume(buffer->len, _query_tracker.get());
         }
     }
@@ -110,8 +110,7 @@ class BufferedBlockMgr2::Client {
         if (buffer->len == _mgr->max_block_size()) {
             DCHECK_GT(_num_pinned_buffers, 0);
             --_num_pinned_buffers;
-            _tracker->ReleaseLocal(buffer->len, _query_tracker.get());
-            // _tracker->Release(buffer->len);
+            _tracker->release(buffer->len, _query_tracker.get());
         }
     }
@@ -221,11 +220,9 @@ BufferedBlockMgr2::BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_m
           _writes_issued(0),
           _state(state) {}

-Status BufferedBlockMgr2::create(RuntimeState* state, const std::shared_ptr<MemTracker>& parent,
-                                 RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr,
-                                 int64_t mem_limit, int64_t block_size,
+Status BufferedBlockMgr2::create(RuntimeState* state, RuntimeProfile* profile,
+                                 TmpFileMgr* tmp_file_mgr, int64_t mem_limit, int64_t block_size,
                                  std::shared_ptr<BufferedBlockMgr2>* block_mgr) {
-    DCHECK(parent != nullptr);
     block_mgr->reset();
     {
         // we do not use global BlockMgrsMap for now, to avoid mem-exceeded different fragments
@@ -247,7 +244,7 @@ Status BufferedBlockMgr2::create(RuntimeState* state, const std::shared_ptr<MemTracker
             _s_query_to_block_mgrs[state->query_id()] = *block_mgr;
         }
     }
-    (*block_mgr)->init(state->exec_env()->disk_io_mgr(), profile, parent, mem_limit);
+    (*block_mgr)->init(state->exec_env()->disk_io_mgr(), profile, mem_limit);
    return Status::OK();
 }
@@ -261,7 +258,7 @@ int64_t BufferedBlockMgr2::available_buffers(Client* client) const {
 int64_t BufferedBlockMgr2::remaining_unreserved_buffers() const {
     int64_t num_buffers =
             _free_io_buffers.size() + _unpinned_blocks.size() + _non_local_outstanding_writes;
-    num_buffers += _mem_tracker->SpareCapacity(MemLimit::HARD) / max_block_size();
+    num_buffers += _mem_tracker->spare_capacity() / max_block_size();
     num_buffers -= _unfullfilled_reserved_buffers;
     return num_buffers;
 }
@@ -292,6 +289,7 @@ void BufferedBlockMgr2::clear_reservations(Client* client) {

 bool BufferedBlockMgr2::try_acquire_tmp_reservation(Client* client, int num_buffers) {
     lock_guard<mutex> lock(_lock);
+    SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
     // TODO: Can the modifications to the client's mem variables be made w/o the lock?
     DCHECK_EQ(client->_num_tmp_reserved_buffers, 0);
     if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
@@ -311,6 +309,7 @@ bool BufferedBlockMgr2::try_acquire_tmp_reservation(Client* client, int num_buff
 }

 bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) {
+    SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
     // Later, we use this interface to manage the memory consumption of the hash table instead of ReservationTracker.
     // So it is possible to allocate 0, which has no additional impact on the behavior of BufferedBlockMgr.
     // The actual memory allocation is still done by BufferPool, because BufferPool has done a lot of optimization in memory allocation
@@ -324,24 +323,24 @@ bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) {
     }
     int buffers_needed = BitUtil::ceil(size, max_block_size());
     unique_lock<mutex> lock(_lock);
-    Status st = _mem_tracker->TryConsume(size);
+    Status st = _mem_tracker->try_consume(size);
     WARN_IF_ERROR(st, "consume failed");
     if (size < max_block_size() && st) {
         // For small allocations (less than a block size), just let the allocation through.
-        client->_tracker->ConsumeLocal(size, client->_query_tracker.get());
-        // client->_tracker->Consume(size);
+        client->_tracker->consume(size, client->_query_tracker.get());
+        // client->_tracker->consume(size);
         return true;
     }

     if (available_buffers(client) + client->_num_tmp_reserved_buffers < buffers_needed) {
         return false;
     }

-    st = _mem_tracker->TryConsume(size);
+    st = _mem_tracker->try_consume(size);
     WARN_IF_ERROR(st, "consume failed");
     if (st) {
         // There was still unallocated memory, don't need to recycle allocated blocks.
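        // As in the small-allocation path above, the renamed consume(size, end_tracker) keeps
        // the old ConsumeLocal() semantics: the bytes are charged against this client's tracker
        // chain but the walk stops at the query-level tracker.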
-        client->_tracker->ConsumeLocal(size, client->_query_tracker.get());
-        // client->_tracker->Consume(size);
+        client->_tracker->consume(size, client->_query_tracker.get());
+        // client->_tracker->consume(size);
         return true;
     }
@@ -386,7 +385,7 @@ bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) {
         }
         client->_num_tmp_reserved_buffers -= additional_tmp_reservations;
         _unfullfilled_reserved_buffers -= additional_tmp_reservations;
-        _mem_tracker->Release(buffers_acquired * max_block_size());
+        _mem_tracker->release(buffers_acquired * max_block_size());
         return false;
     }
@@ -394,21 +393,21 @@ bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) {
     _unfullfilled_reserved_buffers -= buffers_acquired;

     DCHECK_GE(buffers_acquired * max_block_size(), size);
-    _mem_tracker->Release(buffers_acquired * max_block_size());
-    st = _mem_tracker->TryConsume(size);
+    _mem_tracker->release(buffers_acquired * max_block_size());
+    st = _mem_tracker->try_consume(size);
     WARN_IF_ERROR(st, "consume failed");
     if (!st) {
         return false;
     }

-    client->_tracker->ConsumeLocal(size, client->_query_tracker.get());
-    // client->_tracker->Consume(size);
+    client->_tracker->consume(size, client->_query_tracker.get());
+    // client->_tracker->consume(size);
     DCHECK(validate()) << endl << debug_internal();
     return true;
 }

 void BufferedBlockMgr2::release_memory(Client* client, int64_t size) {
-    _mem_tracker->Release(size);
-    client->_tracker->ReleaseLocal(size, client->_query_tracker.get());
+    _mem_tracker->release(size);
+    client->_tracker->release(size, client->_query_tracker.get());
 }

 void BufferedBlockMgr2::cancel() {
@@ -453,6 +452,7 @@ Status BufferedBlockMgr2::add_exec_msg(const std::string& msg) const {

 Status BufferedBlockMgr2::get_new_block(Client* client, Block* unpin_block, Block** block,
                                         int64_t len) {
+    SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
     DCHECK_LE(len, _max_block_size) << "Cannot request block bigger than max_len";
     DCHECK_NE(len, 0) << "Cannot request block of zero size";
     *block = nullptr;
@@ -469,7 +469,7 @@ Status BufferedBlockMgr2::get_new_block(Client* client, Block* unpin_block, Bloc

     if (len > 0 && len < _max_block_size) {
         DCHECK(unpin_block == nullptr);
-        Status st = client->_tracker->TryConsume(len);
+        Status st = client->_tracker->try_consume(len);
         WARN_IF_ERROR(st, "get_new_block failed");
         if (st) {
             // TODO: Have a cache of unused blocks of size 'len' (0, _max_block_size)
@@ -517,6 +517,7 @@ Status BufferedBlockMgr2::get_new_block(Client* client, Block* unpin_block, Bloc
 }

 Status BufferedBlockMgr2::transfer_buffer(Block* dst, Block* src, bool unpin) {
+    SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
     Status status = Status::OK();
     DCHECK(dst != nullptr);
     DCHECK(src != nullptr);
@@ -561,6 +562,7 @@ Status BufferedBlockMgr2::transfer_buffer(Block* dst, Block* src, bool unpin) {
 }

 BufferedBlockMgr2::~BufferedBlockMgr2() {
+    SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
     {
         lock_guard<mutex> lock(_s_block_mgrs_lock);
         BlockMgrsMap::iterator it = _s_query_to_block_mgrs.find(_query_id);
@@ -600,7 +602,7 @@ BufferedBlockMgr2::~BufferedBlockMgr2() {
     // Free memory resources.
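    // Release every cached IO buffer from the tracker before freeing it, so consumption drops
    // back to zero (verified by the DCHECK below).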
    for (BufferDescriptor* buffer : _all_io_buffers) {
-        _mem_tracker->Release(buffer->len);
+        _mem_tracker->release(buffer->len);
         delete[] buffer->buffer;
     }
     DCHECK_EQ(_mem_tracker->consumption(), 0);
@@ -638,6 +640,7 @@ Status BufferedBlockMgr2::delete_or_unpin_block(Block* block, bool unpin) {
 }

 Status BufferedBlockMgr2::pin_block(Block* block, bool* pinned, Block* release_block, bool unpin) {
+    SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
     DCHECK(block != nullptr);
     DCHECK(!block->_is_deleted);
     *pinned = false;
@@ -718,6 +721,7 @@ Status BufferedBlockMgr2::pin_block(Block* block, bool* pinned, Block* release_b
 }

 Status BufferedBlockMgr2::unpin_block(Block* block) {
+    SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
     DCHECK(!block->_is_deleted) << "Unpin for deleted block.";

     lock_guard<mutex> unpinned_lock(_lock);
@@ -920,6 +924,7 @@ void BufferedBlockMgr2::write_complete(Block* block, const Status& write_status)
 }

 void BufferedBlockMgr2::delete_block(Block* block) {
+    SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
     DCHECK(!block->_is_deleted);

     lock_guard<mutex> lock(_lock);
@@ -954,7 +959,7 @@ void BufferedBlockMgr2::delete_block(Block* block) {
     if (block->_buffer_desc->len != _max_block_size) {
         // Just delete the block for now.
         delete[] block->_buffer_desc->buffer;
-        block->_client->_tracker->Release(block->_buffer_desc->len);
+        block->_client->_tracker->release(block->_buffer_desc->len);
         delete block->_buffer_desc;
         block->_buffer_desc = nullptr;
     } else {
@@ -1094,7 +1099,7 @@ Status BufferedBlockMgr2::find_buffer_for_block(Block* block, bool* in_mem) {
 Status BufferedBlockMgr2::find_buffer(unique_lock<mutex>& lock, BufferDescriptor** buffer_desc) {
     *buffer_desc = nullptr;
-    Status st = _mem_tracker->TryConsume(_max_block_size);
+    Status st = _mem_tracker->try_consume(_max_block_size);
     WARN_IF_ERROR(st, "try to allocate a new buffer failed");
     // First, try to allocate a new buffer.
     if (_free_io_buffers.size() < _block_write_threshold && st) {
@@ -1262,15 +1267,13 @@ string BufferedBlockMgr2::debug_internal() const {
        << "  Num available buffers: " << remaining_unreserved_buffers() << endl
        << "  Total pinned buffers: " << _total_pinned_buffers << endl
        << "  Unfullfilled reserved buffers: " << _unfullfilled_reserved_buffers << endl
-       << "  Remaining memory: " << _mem_tracker->SpareCapacity(MemLimit::HARD)
-       << " (#blocks=" << (_mem_tracker->SpareCapacity(MemLimit::HARD) / _max_block_size) << ")"
-       << endl
+       << "  Remaining memory: " << _mem_tracker->spare_capacity()
+       << " (#blocks=" << (_mem_tracker->spare_capacity() / _max_block_size) << ")" << endl
        << "  Block write threshold: " << _block_write_threshold;
     return ss.str();
 }

-void BufferedBlockMgr2::init(DiskIoMgr* io_mgr, RuntimeProfile* parent_profile,
-                             const std::shared_ptr<MemTracker>& parent_tracker, int64_t mem_limit) {
+void BufferedBlockMgr2::init(DiskIoMgr* io_mgr, RuntimeProfile* parent_profile, int64_t mem_limit) {
     unique_lock<mutex> l(_lock);
     if (_initialized) {
         return;
@@ -1295,7 +1298,7 @@ void BufferedBlockMgr2::init(DiskIoMgr* io_mgr, RuntimeProfile* parent_profile,
     _integrity_check_timer = ADD_TIMER(_profile.get(), "TotalIntegrityCheckTime");

     // Create a new mem_tracker and allocate buffers.
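    // Note that the rewritten create_tracker() call below no longer takes an explicit parent
    // tracker.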
-    _mem_tracker = MemTracker::CreateTracker(mem_limit, "BufferedBlockMgr2", parent_tracker);
+    _mem_tracker = MemTracker::create_tracker(mem_limit, "BufferedBlockMgr2");
     _initialized = true;
 }
diff --git a/be/src/runtime/buffered_block_mgr2.h b/be/src/runtime/buffered_block_mgr2.h
index 493398e8d9d5e6..1aeddba9e75828 100644
--- a/be/src/runtime/buffered_block_mgr2.h
+++ b/be/src/runtime/buffered_block_mgr2.h
@@ -283,9 +283,9 @@ class BufferedBlockMgr2 {
     // same query id has already been created, that block mgr is returned.
     // - mem_limit: maximum memory that will be used by the block mgr.
     // - buffer_size: maximum size of each buffer.
-    static Status create(RuntimeState* state, const std::shared_ptr<MemTracker>& parent,
-                         RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr, int64_t mem_limit,
-                         int64_t buffer_size, std::shared_ptr<BufferedBlockMgr2>* block_mgr);
+    static Status create(RuntimeState* state, RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr,
+                         int64_t mem_limit, int64_t buffer_size,
+                         std::shared_ptr<BufferedBlockMgr2>* block_mgr);

     ~BufferedBlockMgr2();
@@ -406,8 +406,7 @@ class BufferedBlockMgr2 {
     BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_mgr, int64_t block_size);

     // Initializes the block mgr. Idempotent and thread-safe.
-    void init(DiskIoMgr* io_mgr, RuntimeProfile* profile,
-              const std::shared_ptr<MemTracker>& parent_tracker, int64_t mem_limit);
+    void init(DiskIoMgr* io_mgr, RuntimeProfile* profile, int64_t mem_limit);

     // Initializes _tmp_files. This is initialized the first time we need to write to disk.
     // Must be called with _lock taken.
diff --git a/be/src/runtime/buffered_tuple_stream2.cc b/be/src/runtime/buffered_tuple_stream2.cc
index d0c9428ffee0a7..6f55086e8befb5 100644
--- a/be/src/runtime/buffered_tuple_stream2.cc
+++ b/be/src/runtime/buffered_tuple_stream2.cc
@@ -484,7 +484,7 @@ Status BufferedTupleStream2::get_rows(unique_ptr<RowBatch>* batch, bool* got_row
         return Status::OK();
     }
     RETURN_IF_ERROR(prepare_for_read(false));
-    batch->reset(new RowBatch(_desc, num_rows(), _block_mgr->get_tracker(_block_mgr_client).get()));
+    batch->reset(new RowBatch(_desc, num_rows()));
     bool eos = false;
     // Loop until get_next fills the entire batch. Each call can stop at block
     // boundaries. We generally want it to stop, so that blocks can be freed
diff --git a/be/src/runtime/buffered_tuple_stream3.cc b/be/src/runtime/buffered_tuple_stream3.cc
index e5bdb9ecd0e532..0da366a645a612 100644
--- a/be/src/runtime/buffered_tuple_stream3.cc
+++ b/be/src/runtime/buffered_tuple_stream3.cc
@@ -18,10 +18,8 @@
 #include

 #include "runtime/buffered_tuple_stream3.inline.h"
-#include "runtime/bufferpool/reservation_tracker.h"
 #include "runtime/descriptors.h"
 #include "runtime/exec_env.h"
-#include "runtime/mem_tracker.h"
 #include "runtime/row_batch.h"
 #include "runtime/runtime_state.h"
 #include "runtime/string_value.h"
@@ -669,8 +667,7 @@ void BufferedTupleStream3::UnpinStream(UnpinMode mode) {
     CHECK_CONSISTENCY_FULL();
 }
 */
-Status BufferedTupleStream3::GetRows(const std::shared_ptr<MemTracker>& tracker,
-                                     std::unique_ptr<RowBatch>* batch, bool* got_rows) {
+Status BufferedTupleStream3::GetRows(std::unique_ptr<RowBatch>* batch, bool* got_rows) {
     if (num_rows() > numeric_limits<int>::max()) {
         // RowBatch::num_rows_ is a 32-bit int, avoid an overflow.
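        // num_rows() returns int64_t, so refuse the request here rather than silently truncate.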
return Status::InternalError( @@ -687,7 +684,7 @@ Status BufferedTupleStream3::GetRows(const std::shared_ptr& tracker, // TODO chenhao // capacity in RowBatch use int, but _num_rows is int64_t // it may be precision loss - batch->reset(new RowBatch(*desc_, num_rows(), tracker.get())); + batch->reset(new RowBatch(*desc_, num_rows())); bool eos = false; // Loop until GetNext fills the entire batch. Each call can stop at page // boundaries. We generally want it to stop, so that pages can be freed diff --git a/be/src/runtime/buffered_tuple_stream3.h b/be/src/runtime/buffered_tuple_stream3.h index 7d8f053d037085..b9d6c13e28801f 100644 --- a/be/src/runtime/buffered_tuple_stream3.h +++ b/be/src/runtime/buffered_tuple_stream3.h @@ -30,7 +30,6 @@ namespace doris { -class MemTracker; class RuntimeState; class RowDescriptor; class SlotDescriptor; @@ -333,8 +332,7 @@ class BufferedTupleStream3 { /// process. If the current unused reservation is not sufficient to pin the stream in /// memory, this will try to increase the reservation. If that fails, 'got_rows' is set /// to false. - Status GetRows(const std::shared_ptr& tracker, std::unique_ptr* batch, - bool* got_rows) WARN_UNUSED_RESULT; + Status GetRows(std::unique_ptr* batch, bool* got_rows) WARN_UNUSED_RESULT; /// Must be called once at the end to cleanup all resources. If 'batch' is non-nullptr, /// attaches buffers from pinned pages that rows returned from GetNext() may reference. diff --git a/be/src/runtime/bufferpool/buffer_allocator.cc b/be/src/runtime/bufferpool/buffer_allocator.cc index a3bbe4c6c2fdc4..58bd873f74f5e0 100644 --- a/be/src/runtime/bufferpool/buffer_allocator.cc +++ b/be/src/runtime/bufferpool/buffer_allocator.cc @@ -22,6 +22,7 @@ #include "common/atomic.h" #include "common/config.h" #include "runtime/bufferpool/system_allocator.h" +#include "runtime/thread_context.h" #include "util/bit_util.h" #include "util/cpu_info.h" #include "util/pretty_printer.h" @@ -220,6 +221,7 @@ Status BufferPool::BufferAllocator::Allocate(ClientHandle* client, int64_t len, COUNTER_UPDATE(client->impl_->counters().cumulative_allocations, 1); RETURN_IF_ERROR(AllocateInternal(len, buffer)); + // thread_local_ctx.get()->consume_mem(len); DCHECK(buffer->is_open()); buffer->client_ = client; return Status::OK(); @@ -245,7 +247,9 @@ Status BufferPool::BufferAllocator::AllocateInternal(int64_t len, BufferHandle* const int current_core = CpuInfo::get_current_core(); // Fast path: recycle a buffer of the correct size from this core's arena. FreeBufferArena* current_core_arena = per_core_arenas_[current_core].get(); - if (current_core_arena->PopFreeBuffer(len, buffer)) return Status::OK(); + if (current_core_arena->PopFreeBuffer(len, buffer)) { + return Status::OK(); + } // Fast-ish path: allocate a new buffer if there is room in 'system_bytes_remaining_'. int64_t delta = DecreaseBytesRemaining(len, true, &system_bytes_remaining_); @@ -264,7 +268,9 @@ Status BufferPool::BufferAllocator::AllocateInternal(int64_t len, BufferHandle* // Each core should start searching from a different point to avoid hot-spots. 
int other_core = numa_node_cores[(numa_node_core_idx + i) % numa_node_cores.size()]; FreeBufferArena* other_core_arena = per_core_arenas_[other_core].get(); - if (other_core_arena->PopFreeBuffer(len, buffer)) return Status::OK(); + if (other_core_arena->PopFreeBuffer(len, buffer)) { + return Status::OK(); + } } /* @@ -298,7 +304,11 @@ Status BufferPool::BufferAllocator::AllocateInternal(int64_t len, BufferHandle* } // We have headroom to allocate a new buffer at this point. DCHECK_EQ(delta, len); - Status status = system_allocator_->Allocate(len, buffer); + Status status; + { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); + status = system_allocator_->Allocate(len, buffer); + } if (!status.ok()) { system_bytes_remaining_.add(len); return status; @@ -375,6 +385,7 @@ void BufferPool::BufferAllocator::Free(BufferHandle&& handle) { handle.client_ = nullptr; // Buffer is no longer associated with a client. FreeBufferArena* arena = per_core_arenas_[handle.home_core_].get(); handle.Poison(); + // thread_local_ctx.get()->release_mem(handle.len()); arena->AddFreeBuffer(std::move(handle)); } @@ -420,6 +431,7 @@ int BufferPool::BufferAllocator::GetFreeListSize(int core, int64_t len) { int64_t BufferPool::BufferAllocator::FreeToSystem(std::vector&& buffers) { int64_t bytes_freed = 0; + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); for (BufferHandle& buffer : buffers) { bytes_freed += buffer.len(); // Ensure that the memory is unpoisoned when it's next allocated by the system. diff --git a/be/src/runtime/bufferpool/buffer_pool.cc b/be/src/runtime/bufferpool/buffer_pool.cc index c0660bd77b135e..3ff0a2e10ef90a 100644 --- a/be/src/runtime/bufferpool/buffer_pool.cc +++ b/be/src/runtime/bufferpool/buffer_pool.cc @@ -378,8 +378,7 @@ BufferPool::Client::Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group buffers_allocated_bytes_(0) { // Set up a child profile with buffer pool info. RuntimeProfile* child_profile = profile->create_child("Buffer pool", true, true); - reservation_.InitChildTracker(child_profile, parent_reservation, mem_tracker.get(), - reservation_limit); + reservation_.InitChildTracker(child_profile, parent_reservation, nullptr, reservation_limit); counters_.alloc_time = ADD_TIMER(child_profile, "AllocTime"); counters_.cumulative_allocations = ADD_COUNTER(child_profile, "CumulativeAllocations", TUnit::UNIT); diff --git a/be/src/runtime/bufferpool/reservation_tracker.cc b/be/src/runtime/bufferpool/reservation_tracker.cc index 4fa41d85e1b751..1e56441a0d8fd4 100644 --- a/be/src/runtime/bufferpool/reservation_tracker.cc +++ b/be/src/runtime/bufferpool/reservation_tracker.cc @@ -60,7 +60,7 @@ void ReservationTracker::InitChildTracker(RuntimeProfile* profile, ReservationTr std::lock_guard l(lock_); DCHECK(!initialized_); parent_ = parent; - mem_tracker_ = mem_tracker; + mem_tracker_ = nullptr; // TODO(zxy) remove ReservationTracker later reservation_limit_ = reservation_limit; reservation_ = 0; @@ -75,8 +75,8 @@ void ReservationTracker::InitChildTracker(RuntimeProfile* profile, ReservationTr DCHECK_EQ(parent_mem_tracker, mem_tracker_->parent().get()); // Make sure we don't have a lower limit than the ancestor, since we don't enforce // limits at lower links. - DCHECK_EQ(mem_tracker_->GetLowestLimit(MemLimit::HARD), - parent_mem_tracker->GetLowestLimit(MemLimit::HARD)); + DCHECK_EQ(mem_tracker_->get_lowest_limit(), + parent_mem_tracker->get_lowest_limit()); } else { // Make sure we didn't leave a gap in the links. E.g. this tracker's grandparent // shouldn't have a MemTracker. 
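The renames applied in this hunk (TryConsume -> try_consume, Release -> release, ConsumeLocal/ReleaseLocal -> the two-argument consume/release) follow one pattern across the whole patch. A minimal usage sketch of the renamed tracker surface, inferred only from the call sites in this diff and not from the headers:

    // Sketch only: how a caller pairs the renamed consume/release calls.
    // 'tracker' is assumed to be a std::shared_ptr<MemTracker> with a limit set.
    Status track_alloc(const std::shared_ptr<MemTracker>& tracker, int64_t len) {
        RETURN_IF_ERROR(tracker->try_consume(len)); // fails with MemoryLimitExceeded
        char* buf = new char[len];                  // this allocation is now accounted for
        // ... use buf ...
        delete[] buf;
        tracker->release(len);                      // every consume needs a matching release
        return Status::OK();
    }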
@@ -110,7 +110,6 @@ void ReservationTracker::InitCounters(RuntimeProfile* profile, int64_t reservati counters_.reservation_limit = ADD_COUNTER(profile, "ReservationLimit", TUnit::BYTES); COUNTER_SET(counters_.reservation_limit, reservation_limit); } - if (mem_tracker_ != nullptr) mem_tracker_->EnableReservationReporting(counters_); } void ReservationTracker::Close() { @@ -187,14 +186,14 @@ bool ReservationTracker::TryConsumeFromMemTracker(int64_t reservation_increase) if (GetParentMemTracker() == nullptr) { // At the topmost link, which may be a MemTracker with a limit, we need to use // TryConsume() to check the limit. - Status st = mem_tracker_->TryConsume(reservation_increase); + Status st = mem_tracker_->try_consume(reservation_increase); WARN_IF_ERROR(st, "TryConsumeFromMemTracker failed"); return st.ok(); } else { // For lower links, there shouldn't be a limit to enforce, so we just need to // update the consumption of the linked MemTracker since the reservation is // already reflected in its parent. - mem_tracker_->ConsumeLocal(reservation_increase, GetParentMemTracker()); + mem_tracker_->consume(reservation_increase, GetParentMemTracker()); return true; } } @@ -203,9 +202,9 @@ void ReservationTracker::ReleaseToMemTracker(int64_t reservation_decrease) { DCHECK_GE(reservation_decrease, 0); if (mem_tracker_ == nullptr) return; if (GetParentMemTracker() == nullptr) { - mem_tracker_->Release(reservation_decrease); + mem_tracker_->release(reservation_decrease); } else { - mem_tracker_->ReleaseLocal(reservation_decrease, GetParentMemTracker()); + mem_tracker_->release(reservation_decrease, GetParentMemTracker()); } } diff --git a/be/src/runtime/cache/result_cache.h b/be/src/runtime/cache/result_cache.h index 910cc191ee3948..7e4352ac7946d1 100644 --- a/be/src/runtime/cache/result_cache.h +++ b/be/src/runtime/cache/result_cache.h @@ -33,7 +33,6 @@ #include "runtime/cache/cache_utils.h" #include "runtime/cache/result_node.h" #include "runtime/mem_pool.h" -#include "runtime/mem_tracker.h" #include "runtime/row_batch.h" #include "runtime/tuple_row.h" diff --git a/be/src/runtime/data_stream_mgr.cpp b/be/src/runtime/data_stream_mgr.cpp index fbe2af1ca74d5b..6363c61d4b3346 100644 --- a/be/src/runtime/data_stream_mgr.cpp +++ b/be/src/runtime/data_stream_mgr.cpp @@ -71,7 +71,7 @@ shared_ptr DataStreamMgr::create_recvr( VLOG_FILE << "creating receiver for fragment=" << fragment_instance_id << ", node=" << dest_node_id; shared_ptr recvr(new DataStreamRecvr( - this, state->instance_mem_tracker(), row_desc, fragment_instance_id, dest_node_id, + this, row_desc, fragment_instance_id, dest_node_id, num_senders, is_merging, buffer_size, profile, sub_plan_query_statistics_recvr)); uint32_t hash_value = get_hash_value(fragment_instance_id, dest_node_id); lock_guard l(_lock); diff --git a/be/src/runtime/data_stream_mgr.h b/be/src/runtime/data_stream_mgr.h index be370603c0e4dc..e627de17276d0f 100644 --- a/be/src/runtime/data_stream_mgr.h +++ b/be/src/runtime/data_stream_mgr.h @@ -30,7 +30,6 @@ #include "gen_cpp/Types_types.h" // for TUniqueId #include "gen_cpp/internal_service.pb.h" #include "runtime/descriptors.h" // for PlanNodeId -#include "runtime/mem_tracker.h" #include "runtime/query_statistics.h" #include "util/runtime_profile.h" diff --git a/be/src/runtime/data_stream_recvr.cc b/be/src/runtime/data_stream_recvr.cc index 962395ad2616ed..f80d12918541bb 100644 --- a/be/src/runtime/data_stream_recvr.cc +++ b/be/src/runtime/data_stream_recvr.cc @@ -28,6 +28,7 @@ #include "runtime/data_stream_mgr.h" #include 
"runtime/row_batch.h" #include "runtime/sorted_run_merger.h" +#include "runtime/thread_context.h" #include "util/debug_util.h" #include "util/logging.h" #include "util/runtime_profile.h" @@ -184,6 +185,8 @@ Status DataStreamRecvr::SenderQueue::get_batch(RowBatch** next_batch) { if (!_pending_closures.empty()) { auto closure_pair = _pending_closures.front(); + // When the batch queue reaches the upper limit of memory, calling run to let + // brpc send data packets may cause additional memory to be released closure_pair.first->Run(); _pending_closures.pop_front(); @@ -248,7 +251,7 @@ void DataStreamRecvr::SenderQueue::add_batch(const PRowBatch& pb_batch, int be_n // Note: if this function makes a row batch, the batch *must* be added // to _batch_queue. It is not valid to create the row batch and destroy // it in this thread. - batch = new RowBatch(_recvr->row_desc(), pb_batch, _recvr->mem_tracker().get()); + batch = new RowBatch(_recvr->row_desc(), pb_batch); } VLOG_ROW << "added #rows=" << batch->num_rows() << " batch_size=" << batch_size << "\n"; @@ -270,8 +273,7 @@ void DataStreamRecvr::SenderQueue::add_batch(RowBatch* batch, bool use_move) { if (_is_cancelled) { return; } - RowBatch* nbatch = - new RowBatch(_recvr->row_desc(), batch->capacity(), _recvr->mem_tracker().get()); + RowBatch* nbatch = new RowBatch(_recvr->row_desc(), batch->capacity()); if (use_move) { nbatch->acquire_state(batch); } else { @@ -360,6 +362,7 @@ void DataStreamRecvr::SenderQueue::close() { Status DataStreamRecvr::create_merger(const TupleRowComparator& less_than) { DCHECK(_is_merging); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); vector child_input_batch_suppliers; // Create the merger that will a single stream of sorted rows. _merger.reset(new SortedRunMerger(less_than, &_row_desc, _profile, false)); @@ -373,8 +376,9 @@ Status DataStreamRecvr::create_merger(const TupleRowComparator& less_than) { } Status DataStreamRecvr::create_parallel_merger(const TupleRowComparator& less_than, - uint32_t batch_size, MemTracker* mem_tracker) { + uint32_t batch_size) { DCHECK(_is_merging); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); vector child_input_batch_suppliers; // Create the merger that will a single stream of sorted rows. @@ -398,8 +402,8 @@ Status DataStreamRecvr::create_parallel_merger(const TupleRowComparator& less_th auto step = _sender_queues.size() / parallel_thread + 1; for (int i = 0; i < _sender_queues.size(); i += step) { // Create the merger that will a single stream of sorted rows. - std::unique_ptr child_merger(new ChildSortedRunMerger( - less_than, &_row_desc, _profile, mem_tracker, batch_size, false)); + std::unique_ptr child_merger( + new ChildSortedRunMerger(less_than, &_row_desc, _profile, batch_size, false)); vector input_batch_suppliers; for (int j = i; j < std::min((size_t)i + step, _sender_queues.size()); ++j) { input_batch_suppliers.emplace_back(bind(mem_fn(&SenderQueue::get_batch), @@ -420,6 +424,7 @@ void DataStreamRecvr::transfer_all_resources(RowBatch* transfer_batch) { // _child_mergers is not empty, means use parallel merge need transfer resource from // _sender queue. 
@@ -420,6 +424,7 @@ void DataStreamRecvr::transfer_all_resources(RowBatch* transfer_batch) {
     // If _child_mergers is not empty, parallel merge is in use, so the resources need to
     // be transferred from each child merger's input row batch rather than from the
     // sender queues directly.
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     if (!_child_mergers.empty()) {
         _merger->transfer_all_resources(transfer_batch);
     } else {
@@ -432,10 +437,9 @@ void DataStreamRecvr::transfer_all_resources(RowBatch* transfer_batch) {
 }
 DataStreamRecvr::DataStreamRecvr(
-        DataStreamMgr* stream_mgr, const std::shared_ptr<MemTracker>& parent_tracker,
-        const RowDescriptor& row_desc, const TUniqueId& fragment_instance_id,
-        PlanNodeId dest_node_id, int num_senders, bool is_merging, int total_buffer_limit,
-        RuntimeProfile* profile,
+        DataStreamMgr* stream_mgr, const RowDescriptor& row_desc,
+        const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id, int num_senders,
+        bool is_merging, int total_buffer_limit, RuntimeProfile* profile,
         std::shared_ptr<QueryStatisticsRecvr> sub_plan_query_statistics_recvr)
         : _mgr(stream_mgr),
           _fragment_instance_id(fragment_instance_id),
@@ -446,7 +450,8 @@ DataStreamRecvr::DataStreamRecvr(
           _num_buffered_bytes(0),
           _profile(profile),
           _sub_plan_query_statistics_recvr(sub_plan_query_statistics_recvr) {
-    _mem_tracker = MemTracker::CreateTracker(_profile, -1, "DataStreamRecvr", parent_tracker);
+    _mem_tracker = MemTracker::create_tracker(-1, "DataStreamRecvr", nullptr,
+                                              MemTrackerLevel::VERBOSE, _profile);
     // Create one queue per sender if is_merging is true.
     int num_queues = is_merging ? num_senders : 1;
@@ -468,17 +473,20 @@ DataStreamRecvr::DataStreamRecvr(
 Status DataStreamRecvr::get_next(RowBatch* output_batch, bool* eos) {
     DCHECK(_merger.get() != nullptr);
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     return _merger->get_next(output_batch, eos);
 }
 void DataStreamRecvr::add_batch(const PRowBatch& batch, int sender_id, int be_number,
                                 int64_t packet_seq, ::google::protobuf::Closure** done) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     int use_sender_id = _is_merging ? sender_id : 0;
     // Add all batches to the same queue if _is_merging is false.
     _sender_queues[use_sender_id]->add_batch(batch, be_number, packet_seq, done);
 }
 void DataStreamRecvr::add_batch(RowBatch* batch, int sender_id, bool use_move) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     int use_sender_id = _is_merging ? sender_id : 0;
     _sender_queues[use_sender_id]->add_batch(batch, use_move);
 }
@@ -495,6 +503,7 @@ void DataStreamRecvr::cancel_stream() {
 }
 void DataStreamRecvr::close() {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     for (int i = 0; i < _sender_queues.size(); ++i) {
         _sender_queues[i]->close();
     }
@@ -503,8 +512,6 @@ void DataStreamRecvr::close() {
     _mgr->deregister_recvr(fragment_instance_id(), dest_node_id());
     _mgr = nullptr;
     _merger.reset();
-    // TODO: Maybe shared tracker doesn't need to be reset manually
-    _mem_tracker.reset();
 }
 DataStreamRecvr::~DataStreamRecvr() {
@@ -514,6 +521,7 @@ DataStreamRecvr::~DataStreamRecvr() {
 Status DataStreamRecvr::get_batch(RowBatch** next_batch) {
     DCHECK(!_is_merging);
     DCHECK_EQ(_sender_queues.size(), 1);
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     return _sender_queues[0]->get_batch(next_batch);
 }
diff --git a/be/src/runtime/data_stream_recvr.h b/be/src/runtime/data_stream_recvr.h
index 3e4806c6c28ca2..9bc084d6462b6a 100644
--- a/be/src/runtime/data_stream_recvr.h
+++ b/be/src/runtime/data_stream_recvr.h
@@ -88,8 +88,7 @@ class DataStreamRecvr {
     // queues. The exprs used in less_than must have already been prepared and opened.
Status create_merger(const TupleRowComparator& less_than); - Status create_parallel_merger(const TupleRowComparator& less_than, uint32_t batch_size, - MemTracker* mem_tracker); + Status create_parallel_merger(const TupleRowComparator& less_than, uint32_t batch_size); // Fill output_batch with the next batch of rows obtained by merging the per-sender // input streams. Must only be called if _is_merging is true. Status get_next(RowBatch* output_batch, bool* eos); @@ -101,7 +100,6 @@ class DataStreamRecvr { const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } PlanNodeId dest_node_id() const { return _dest_node_id; } const RowDescriptor& row_desc() const { return _row_desc; } - std::shared_ptr mem_tracker() const { return _mem_tracker; } void add_sub_plan_statistics(const PQueryStatistics& statistics, int sender_id) { _sub_plan_query_statistics_recvr->insert(statistics, sender_id); @@ -115,10 +113,9 @@ class DataStreamRecvr { friend class DataStreamMgr; class SenderQueue; - DataStreamRecvr(DataStreamMgr* stream_mgr, const std::shared_ptr& parent_tracker, - const RowDescriptor& row_desc, const TUniqueId& fragment_instance_id, - PlanNodeId dest_node_id, int num_senders, bool is_merging, - int total_buffer_limit, RuntimeProfile* profile, + DataStreamRecvr(DataStreamMgr* stream_mgr, const RowDescriptor& row_desc, + const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id, int num_senders, + bool is_merging, int total_buffer_limit, RuntimeProfile* profile, std::shared_ptr sub_plan_query_statistics_recvr); // If receive queue is full, done is enqueue pending, and return with *done is nullptr diff --git a/be/src/runtime/data_stream_sender.cpp b/be/src/runtime/data_stream_sender.cpp index 681f5fc20b0db1..4b311533ecf65c 100644 --- a/be/src/runtime/data_stream_sender.cpp +++ b/be/src/runtime/data_stream_sender.cpp @@ -39,6 +39,7 @@ #include "runtime/raw_value.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/tuple_row.h" #include "service/backend_options.h" #include "service/brpc.h" @@ -89,7 +90,7 @@ Status DataStreamSender::Channel::init(RuntimeState* state) { // TODO: figure out how to size _batch int capacity = std::max(1, _buffer_size / std::max(_row_desc.get_row_size(), 1)); - _batch.reset(new RowBatch(_row_desc, capacity, _parent->_mem_tracker.get())); + _batch.reset(new RowBatch(_row_desc, capacity)); if (_brpc_dest_addr.hostname.empty()) { LOG(WARNING) << "there is no brpc destination address's hostname" @@ -388,9 +389,10 @@ Status DataStreamSender::prepare(RuntimeState* state) { << "])"; _profile = _pool->add(new RuntimeProfile(title.str())); SCOPED_TIMER(_profile->total_time_counter()); - _mem_tracker = MemTracker::CreateTracker( - _profile, -1, "DataStreamSender:" + print_id(state->fragment_instance_id()), - state->instance_mem_tracker()); + _mem_tracker = MemTracker::create_tracker( + -1, "DataStreamSender:" + print_id(state->fragment_instance_id()), + state->instance_mem_tracker(), MemTrackerLevel::VERBOSE, _profile); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (_part_type == TPartitionType::UNPARTITIONED || _part_type == TPartitionType::RANDOM) { std::random_device rd; @@ -430,6 +432,7 @@ DataStreamSender::~DataStreamSender() { } Status DataStreamSender::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK(state != nullptr); RETURN_IF_ERROR(Expr::open(_partition_expr_ctxs, state)); for (auto iter : _partition_infos) { @@ 
-439,6 +442,7 @@ Status DataStreamSender::open(RuntimeState* state) { } Status DataStreamSender::send(RuntimeState* state, RowBatch* batch) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_profile->total_time_counter()); // Unpartition or _channel size @@ -642,6 +646,7 @@ Status DataStreamSender::close(RuntimeState* state, Status exec_status) { // make all channels close parallel if (_closed) return Status::OK(); _closed = true; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); Status final_st = Status::OK(); for (int i = 0; i < _channels.size(); ++i) { Status st = _channels[i]->close(state); diff --git a/be/src/runtime/disk_io_mgr.cc b/be/src/runtime/disk_io_mgr.cc index 9b9b350fee8e29..f575d17df7da07 100644 --- a/be/src/runtime/disk_io_mgr.cc +++ b/be/src/runtime/disk_io_mgr.cc @@ -20,6 +20,8 @@ #include #include "runtime/disk_io_mgr_internal.h" +#include "runtime/exec_env.h" +#include "runtime/thread_context.h" using std::string; using std::stringstream; @@ -198,7 +200,10 @@ string DiskIoMgr::debug_string() { } DiskIoMgr::BufferDescriptor::BufferDescriptor(DiskIoMgr* io_mgr) - : _io_mgr(io_mgr), _reader(nullptr), _buffer(nullptr) {} + : _io_mgr(io_mgr), + _reader(nullptr), + _buffer(nullptr), + _mem_tracker(io_mgr->cached_buffers_mem_tracker()) {} void DiskIoMgr::BufferDescriptor::reset(RequestContext* reader, ScanRange* range, char* buffer, int64_t buffer_len) { @@ -229,14 +234,8 @@ void DiskIoMgr::BufferDescriptor::set_mem_tracker(std::shared_ptr tr if (_mem_tracker.get() == tracker.get()) { return; } - // TODO(yingchun): use TransferTo? - if (_mem_tracker != nullptr) { - _mem_tracker->Release(_buffer_len); - } + _mem_tracker->transfer_to(tracker, _buffer_len); _mem_tracker = std::move(tracker); - if (_mem_tracker != nullptr) { - _mem_tracker->Consume(_buffer_len); - } } DiskIoMgr::WriteRange::WriteRange(const string& file, int64_t file_offset, int disk_id, @@ -275,6 +274,9 @@ DiskIoMgr::DiskIoMgr() // std::min((uint64_t)config::max_cached_file_handles, FileSystemUtil::max_num_file_handles()), // &HdfsCachedFileHandle::release) { { + _mem_tracker = + MemTracker::create_tracker(-1, "DiskIO", nullptr, MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t max_buffer_size_scaled = bit_ceil(_max_buffer_size, _min_buffer_size); _free_buffers.resize(bit_log2(max_buffer_size_scaled) + 1); int num_local_disks = (config::num_disks == 0 ? DiskInfo::num_disks() : config::num_disks); @@ -295,6 +297,9 @@ DiskIoMgr::DiskIoMgr(int num_local_disks, int threads_per_disk, int min_buffer_s // _file_handle_cache(::min(config::max_cached_file_handles, // FileSystemUtil::max_num_file_handles()), &HdfsCachedFileHandle::release) { { + _mem_tracker = + MemTracker::create_tracker(-1, "DiskIO", nullptr, MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t max_buffer_size_scaled = bit_ceil(_max_buffer_size, _min_buffer_size); _free_buffers.resize(bit_log2(max_buffer_size_scaled) + 1); if (num_local_disks == 0) { @@ -305,6 +310,7 @@ DiskIoMgr::DiskIoMgr(int num_local_disks, int threads_per_disk, int min_buffer_s } DiskIoMgr::~DiskIoMgr() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); _shut_down = true; // Notify all worker threads and shut them down. 
    for (int i = 0; i < _disk_queues.size(); ++i) {
@@ -359,14 +365,15 @@ DiskIoMgr::~DiskIoMgr() {
     */
 }
-Status DiskIoMgr::init(const std::shared_ptr<MemTracker>& process_mem_tracker) {
-    DCHECK(process_mem_tracker != nullptr);
-    _process_mem_tracker = process_mem_tracker;
+Status DiskIoMgr::init(const int64_t mem_limit) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
+    _mem_tracker->set_limit(mem_limit);
+    _cached_buffers_mem_tracker = MemTracker::create_tracker(
+            mem_limit, "DiskIO:CachedBuffers", _mem_tracker, MemTrackerLevel::OVERVIEW);
     // If we hit the process limit, see if we can reclaim some memory by removing
     // previously allocated (but unused) io buffers.
-    /*
-     * process_mem_tracker->AddGcFunction(bind(&DiskIoMgr::gc_io_buffers, this));
-     */
+    MemTracker::get_process_tracker()->add_gc_function(
+            std::bind(&DiskIoMgr::gc_io_buffers, this, std::placeholders::_1));
     for (int i = 0; i < _disk_queues.size(); ++i) {
         _disk_queues[i] = new DiskQueue(i);
@@ -387,7 +394,7 @@ Status DiskIoMgr::init(const std::shared_ptr<MemTracker>& process_mem_tracker) {
             // _disk_thread_group.AddThread(new Thread("disk-io-mgr", ss.str(),
             //         &DiskIoMgr::work_loop, this, _disk_queues[i]));
             _disk_thread_group.add_thread(
-                    new std::thread(std::bind(&DiskIoMgr::work_loop, this, _disk_queues[i])));
+                    new std::thread(std::bind(&DiskIoMgr::work_loop, this, _disk_queues[i], _mem_tracker)));
         }
     }
     _request_context_cache.reset(new RequestContextCache(this));
@@ -446,6 +453,7 @@ void DiskIoMgr::unregister_context(RequestContext* reader) {
 // is on.
 // If wait_for_disks_completion is true, wait for the number of active disks to become 0.
 void DiskIoMgr::cancel_context(RequestContext* context, bool wait_for_disks_completion) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     context->cancel(Status::Cancelled("Cancelled"));
     if (wait_for_disks_completion) {
@@ -523,6 +531,7 @@ Status DiskIoMgr::validate_scan_range(ScanRange* range) {
 Status DiskIoMgr::add_scan_ranges(RequestContext* reader, const vector<ScanRange*>& ranges,
                                   bool schedule_immediately) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     if (ranges.empty()) {
         return Status::OK();
     }
@@ -572,6 +581,7 @@ Status DiskIoMgr::add_scan_ranges(RequestContext* reader, const vector_status.ok()) {
         DCHECK(buffer_desc->_buffer == nullptr);
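add_gc_function replaces the commented-out AddGcFunction call above. Judging from the std::bind placeholder, the hook now receives a byte target when the process tracker exceeds its limit; an equivalent registration with a lambda would look like the sketch below (the exact add_gc_function contract is assumed from this one call site):

    // Illustrative only: free cached io buffers when the process tracker asks for memory back.
    MemTracker::get_process_tracker()->add_gc_function(
            [this](int64_t bytes_to_free) { gc_io_buffers(bytes_to_free); });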
@@ -713,9 +725,10 @@ char* DiskIoMgr::get_free_buffer(int64_t* buffer_size) {
     char* buffer = nullptr;
     if (_free_buffers[idx].empty()) {
         ++_num_allocated_buffers;
-        // Update the process mem usage. This is checked the next time we start
+        // Update the disk io mem usage. This is checked the next time we start
         // a read for the next reader (DiskIoMgr::GetNextScanRange)
-        _process_mem_tracker->Consume(*buffer_size);
+        _cached_buffers_mem_tracker->consume(*buffer_size);
+        SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
         buffer = new char[*buffer_size];
     } else {
         buffer = _free_buffers[idx].front();
@@ -725,28 +738,34 @@ char* DiskIoMgr::get_free_buffer(int64_t* buffer_size) {
     return buffer;
 }
-void DiskIoMgr::gc_io_buffers() {
+void DiskIoMgr::gc_io_buffers(int64_t bytes_to_free) {
     unique_lock lock(_free_buffers_lock);
     int buffers_freed = 0;
     int bytes_freed = 0;
     for (int idx = 0; idx < _free_buffers.size(); ++idx) {
-        for (list<char*>::iterator iter = _free_buffers[idx].begin();
-             iter != _free_buffers[idx].end(); ++iter) {
-            int64_t buffer_size = (1 << idx) * _min_buffer_size;
-            _process_mem_tracker->Release(buffer_size);
-            --_num_allocated_buffers;
-            delete[] * iter;
-
-            ++buffers_freed;
-            bytes_freed += buffer_size;
+        {
+            SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
+            for (list<char*>::iterator iter = _free_buffers[idx].begin();
+                 iter != _free_buffers[idx].end(); ++iter) {
+                int64_t buffer_size = (1 << idx) * _min_buffer_size;
+                _cached_buffers_mem_tracker->release(buffer_size);
+                --_num_allocated_buffers;
+                delete[] * iter;
+
+                ++buffers_freed;
+                bytes_freed += buffer_size;
+            }
         }
         _free_buffers[idx].clear();
+        if (bytes_freed >= bytes_to_free) {
+            break;
+        }
     }
 }
 void DiskIoMgr::return_free_buffer(BufferDescriptor* desc) {
     return_free_buffer(desc->_buffer, desc->_buffer_len);
-    desc->set_mem_tracker(nullptr);
+    desc->set_mem_tracker(_cached_buffers_mem_tracker);
     desc->_buffer = nullptr;
 }
@@ -760,8 +779,9 @@ void DiskIoMgr::return_free_buffer(char* buffer, int64_t buffer_size) {
     if (!config::disable_mem_pools && _free_buffers[idx].size() < config::max_free_io_buffers) {
         _free_buffers[idx].push_back(buffer);
     } else {
-        _process_mem_tracker->Release(buffer_size);
+        _cached_buffers_mem_tracker->release(buffer_size);
         --_num_allocated_buffers;
+        SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();
         delete[] buffer;
     }
 }
@@ -817,15 +837,9 @@ bool DiskIoMgr::get_next_request_range(DiskQueue* disk_queue, RequestRange** ran
     // We just picked a reader, check the mem limits.
     // TODO: we can do a lot better here. The reader can likely make progress
     // with fewer io buffers.
-    bool process_limit_exceeded = _process_mem_tracker->limit_exceeded();
-    bool reader_limit_exceeded =
-            (*request_context)->_mem_tracker != nullptr
-                    ? (*request_context)->_mem_tracker->AnyLimitExceeded(MemLimit::HARD)
-                    : false;
-    // bool reader_limit_exceeded = (*request_context)->_mem_tracker != nullptr
-    //         ? (*request_context)->_mem_tracker->limit_exceeded() : false;
-
-    if (process_limit_exceeded || reader_limit_exceeded) {
+    if ((*request_context)->_mem_tracker != nullptr
+                ? (*request_context)->_mem_tracker->any_limit_exceeded()
+                : false) {
         (*request_context)->cancel(Status::MemoryLimitExceeded("Memory limit exceeded"));
     }
@@ -977,7 +991,7 @@ void DiskIoMgr::handle_read_finished(DiskQueue* disk_queue, RequestContext* read
     state.decrement_request_thread();
 }
-void DiskIoMgr::work_loop(DiskQueue* disk_queue) {
+void DiskIoMgr::work_loop(DiskQueue* disk_queue, std::shared_ptr<MemTracker> mem_tracker) {
     // The thread waits until there is work or the entire system is being shut down.
     // If there is work, performs the read or write requested and re-enqueues the
     // requesting context.
@@ -989,6 +1003,7 @@ void DiskIoMgr::work_loop(DiskQueue* disk_queue) {
     // re-enqueues the request.
     // 3. Perform the read or write as specified.
// Cancellation checking needs to happen in both steps 1 and 3. + SCOPED_ATTACH_TASK_THREAD_2ARG(ThreadContext::TaskType::QUERY, mem_tracker); while (!_shut_down) { RequestContext* worker_context = nullptr; ; @@ -1017,17 +1032,8 @@ void DiskIoMgr::read_range(DiskQueue* disk_queue, RequestContext* reader, ScanRa int64_t bytes_remaining = range->_len - range->_bytes_read; DCHECK_GT(bytes_remaining, 0); int64_t buffer_size = std::min(bytes_remaining, static_cast(_max_buffer_size)); - bool enough_memory = true; - if (reader->_mem_tracker != nullptr) { - enough_memory = reader->_mem_tracker->SpareCapacity(MemLimit::HARD) > LOW_MEMORY; - if (!enough_memory) { - // Low memory, GC and try again. - gc_io_buffers(); - enough_memory = reader->_mem_tracker->SpareCapacity(MemLimit::HARD) > LOW_MEMORY; - } - } - if (!enough_memory) { + if (reader->_mem_tracker != nullptr && reader->_mem_tracker->spare_capacity() <= LOW_MEMORY) { RequestContext::PerDiskState& state = reader->_disk_states[disk_queue->disk_id]; unique_lock reader_lock(reader->_lock); @@ -1151,6 +1157,7 @@ int DiskIoMgr::free_buffers_idx(int64_t buffer_size) { } Status DiskIoMgr::add_write_range(RequestContext* writer, WriteRange* write_range) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK_LE(write_range->len(), _max_buffer_size); unique_lock writer_lock(writer->_lock); diff --git a/be/src/runtime/disk_io_mgr.h b/be/src/runtime/disk_io_mgr.h index af988fb73ee067..9b716abdae6e7d 100644 --- a/be/src/runtime/disk_io_mgr.h +++ b/be/src/runtime/disk_io_mgr.h @@ -262,9 +262,6 @@ class DiskIoMgr { // Reader that this buffer is for RequestContext* _reader; - // The current tracker this buffer is associated with. - std::shared_ptr _mem_tracker; - // Scan range that this buffer is for. ScanRange* _scan_range; @@ -284,6 +281,9 @@ class DiskIoMgr { Status _status; int64_t _scan_range_offset; + + // The current tracker this buffer is associated with. + std::shared_ptr _mem_tracker; }; // The request type, read or write associated with a request range. @@ -542,7 +542,7 @@ class DiskIoMgr { ~DiskIoMgr(); // Initialize the IoMgr. Must be called once before any of the other APIs. - Status init(const std::shared_ptr& process_mem_tracker); + Status init(const int64_t mem_limit); // Allocates tracking structure for a request context. // Register a new request context which is returned in *request_context. @@ -657,6 +657,10 @@ class DiskIoMgr { // Returns the number of buffers currently owned by all readers. int num_buffers_in_readers() const { return _num_buffers_in_readers; } + std::shared_ptr cached_buffers_mem_tracker() const { + return _cached_buffers_mem_tracker; + } + // Dumps the disk IoMgr queues (for readers and disks) std::string debug_string(); @@ -691,8 +695,9 @@ class DiskIoMgr { // Pool to allocate BufferDescriptors. ObjectPool _pool; - // Process memory tracker; needed to account for io buffers. - std::shared_ptr _process_mem_tracker; + std::shared_ptr _mem_tracker; + // account for io buffers. + std::shared_ptr _cached_buffers_mem_tracker; // Number of worker(read) threads per disk. Also the max depth of queued // work to the disk. @@ -787,10 +792,9 @@ class DiskIoMgr { char* get_free_buffer(int64_t* buffer_size); // Garbage collect all unused io buffers. This is currently only triggered when the - // process wide limit is hit. This is not good enough. While it is sufficient for - // the IoMgr, other components do not trigger this GC. + // process wide limit is hit. // TODO: make this run periodically? 
- void gc_io_buffers(); + void gc_io_buffers(int64_t bytes_to_free = INT_MAX); // Returns a buffer to the free list. buffer_size / _min_buffer_size should be a power // of 2, and buffer_size should be <= _max_buffer_size. These constraints will be met @@ -804,7 +808,7 @@ class DiskIoMgr { // Disk worker thread loop. This function retrieves the next range to process on // the disk queue and invokes read_range() or Write() depending on the type of Range(). // There can be multiple threads per disk running this loop. - void work_loop(DiskQueue* queue); + void work_loop(DiskQueue* queue, std::shared_ptr mem_tracker); // This is called from the disk thread to get the next range to process. It will // wait until a scan range and buffer are available, or a write range is available. diff --git a/be/src/runtime/dpp_sink.cpp b/be/src/runtime/dpp_sink.cpp index 1321a6a1b91876..6605c75269f919 100644 --- a/be/src/runtime/dpp_sink.cpp +++ b/be/src/runtime/dpp_sink.cpp @@ -557,8 +557,7 @@ Status Translator::prepare(RuntimeState* state) { RETURN_IF_ERROR(create_writer(state)); // 4. new batch for writer - _batch_to_write.reset( - new RowBatch(_row_desc, state->batch_size(), state->instance_mem_tracker().get())); + _batch_to_write.reset(new RowBatch(_row_desc, state->batch_size())); if (_batch_to_write.get() == nullptr) { return Status::InternalError("No memory to allocate RowBatch."); } @@ -795,7 +794,7 @@ Status Translator::process(RuntimeState* state) { SCOPED_TIMER(_agg_timer); bool eos = false; while (!eos) { - RowBatch batch(_row_desc, state->batch_size(), state->instance_mem_tracker().get()); + RowBatch batch(_row_desc, state->batch_size()); RETURN_IF_ERROR(_sorter->get_next(&batch, &eos)); diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index 39808bf8704201..a2d51625fdb645 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -20,6 +20,8 @@ #include "common/status.h" #include "olap/options.h" +#include "runtime/mem_tracker.h" +#include "runtime/mem_tracker_task_pool.h" #include "util/threadpool.h" namespace doris { @@ -45,7 +47,7 @@ class LoadPathMgr; class LoadStreamMgr; class MemTracker; class StorageEngine; -class PoolMemTrackerRegistry; +class MemTrackerTaskPool; class PriorityThreadPool; class ReservationTracker; class ResultBufferMgr; @@ -58,6 +60,9 @@ class WebPageHandler; class StreamLoadExecutor; class RoutineLoadTaskExecutor; class SmallFileMgr; +class ThreadContext; + +static std::vector free_thread_ctx; class BackendServiceClient; class FrontendServiceClient; @@ -96,6 +101,7 @@ class ExecEnv { // declarations for classes in scoped_ptrs. 
    ~ExecEnv();
+    bool initialized() const { return _is_init; }
     const std::string& token() const;
     ExternalScanContextMgr* external_scan_context_mgr() { return _external_scan_context_mgr; }
     DataStreamMgr* stream_mgr() { return _stream_mgr; }
@@ -115,8 +121,16 @@ class ExecEnv {
         return nullptr;
     }
-    std::shared_ptr<MemTracker> process_mem_tracker() { return _mem_tracker; }
-    PoolMemTrackerRegistry* pool_mem_trackers() { return _pool_mem_trackers; }
+    // std::shared_ptr<MemTracker> process_mem_tracker() { return _process_mem_tracker; }
+    // MemTracker* process_mem_tracker_raw() { return _process_mem_tracker.get(); }
+    std::shared_ptr<MemTracker> new_process_mem_tracker() { return _new_process_mem_tracker; }
+    std::shared_ptr<MemTracker> query_pool_mem_tracker() { return _query_pool_mem_tracker; }
+    std::shared_ptr<MemTracker> load_pool_mem_tracker() { return _load_pool_mem_tracker; }
+    MemTrackerTaskPool* task_pool_mem_tracker_registry() {
+        return _task_pool_mem_tracker_registry.get();
+    }
+    std::vector free_thread_ctx() { return _free_thread_ctx; }
+    // ThreadContext* get_thread_local_ctx() { return thread_local_ctx(); }
     ThreadResourceMgr* thread_mgr() { return _thread_mgr; }
     PriorityThreadPool* scan_thread_pool() { return _scan_thread_pool; }
     ThreadPool* limited_scan_thread_pool() { return _limited_scan_thread_pool.get(); }
@@ -153,9 +167,6 @@ class ExecEnv {
     RoutineLoadTaskExecutor* routine_load_task_executor() { return _routine_load_task_executor; }
     HeartbeatFlags* heartbeat_flags() { return _heartbeat_flags; }
-    // The root tracker should be set before calling ExecEnv::init();
-    void set_root_mem_tracker(std::shared_ptr<MemTracker> root_tracker);
-
 private:
     Status _init(const std::vector<StorePath>& store_paths);
     void _destroy();
@@ -180,10 +191,20 @@ class ExecEnv {
     ClientCache* _frontend_client_cache = nullptr;
     ClientCache* _broker_client_cache = nullptr;
     ClientCache* _extdatasource_client_cache = nullptr;
-    std::shared_ptr<MemTracker> _mem_tracker;
-    PoolMemTrackerRegistry* _pool_mem_trackers = nullptr;
     ThreadResourceMgr* _thread_mgr = nullptr;
+    // The ancestor of all trackers in the process. It is the only child of the root tracker.
+    // All manually created trackers should specify the process tracker as the parent.
+    // std::shared_ptr<MemTracker> _process_mem_tracker = nullptr;
+    std::shared_ptr<MemTracker> _new_process_mem_tracker = nullptr;
+    // The ancestor of all query trackers.
+    std::shared_ptr<MemTracker> _query_pool_mem_tracker = nullptr;
+    // The ancestor of all load trackers.
+    std::shared_ptr<MemTracker> _load_pool_mem_tracker = nullptr;
+    std::unique_ptr<MemTrackerTaskPool> _task_pool_mem_tracker_registry;
+
+    std::vector _free_thread_ctx;
+
     // The following two thread pools are used in different scenarios.
     // _scan_thread_pool is a priority thread pool.
// Scanner threads for common queries will use this thread pool, diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index eb29e5eaae3048..7f75c6b2d111bc 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -44,6 +44,7 @@ #include "runtime/load_channel_mgr.h" #include "runtime/load_path_mgr.h" #include "runtime/mem_tracker.h" +#include "runtime/mem_tracker_task_pool.h" #include "runtime/result_buffer_mgr.h" #include "runtime/result_queue_mgr.h" #include "runtime/routine_load/routine_load_task_executor.h" @@ -72,6 +73,8 @@ DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(send_batch_thread_pool_thread_num, MetricUnit DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(send_batch_thread_pool_queue_size, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(query_mem_consumption, MetricUnit::BYTES, "", mem_consumption, Labels({{"type", "query"}})); +DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(load_mem_consumption, MetricUnit::BYTES, "", mem_consumption, + Labels({{"type", "load"}})); Status ExecEnv::init(ExecEnv* env, const std::vector& store_paths) { return env->_init(store_paths); @@ -93,7 +96,7 @@ Status ExecEnv::_init(const std::vector& store_paths) { _broker_client_cache = new BrokerServiceClientCache(config::max_client_cache_size_per_host); _extdatasource_client_cache = new ExtDataSourceServiceClientCache(config::max_client_cache_size_per_host); - _pool_mem_trackers = new PoolMemTrackerRegistry(); + _task_pool_mem_tracker_registry.reset(new MemTrackerTaskPool()); _thread_mgr = new ThreadResourceMgr(); _scan_thread_pool = new PriorityThreadPool(config::doris_scanner_thread_pool_thread_num, config::doris_scanner_thread_pool_queue_size); @@ -146,7 +149,7 @@ Status ExecEnv::_init(const std::vector& store_paths) { _small_file_mgr->init(); _init_mem_tracker(); - RETURN_IF_ERROR(_load_channel_mgr->init(_mem_tracker->limit())); + RETURN_IF_ERROR(_load_channel_mgr->init(MemTracker::get_process_tracker()->limit())); _heartbeat_flags = new HeartbeatFlags(); _register_metrics(); _is_init = true; @@ -173,10 +176,22 @@ Status ExecEnv::_init_mem_tracker() { << ". 
Using physical memory instead"; global_memory_limit_bytes = MemInfo::physical_mem(); } - _mem_tracker = MemTracker::CreateTracker(global_memory_limit_bytes, "Process", - MemTracker::GetRootTracker(), false, false, - MemTrackerLevel::OVERVIEW); - REGISTER_HOOK_METRIC(query_mem_consumption, [this]() { return _mem_tracker->consumption(); }); + MemTracker::get_process_tracker()->set_limit(global_memory_limit_bytes); + // _process_mem_tracker = + // MemTracker::create_tracker(global_memory_limit_bytes, "Process", + // MemTracker::get_root_tracker(), MemTrackerLevel::OVERVIEW); + _new_process_mem_tracker = + MemTracker::create_virtual_tracker(global_memory_limit_bytes, "NewProcess", + nullptr, MemTrackerLevel::OVERVIEW); + _query_pool_mem_tracker = + MemTracker::create_tracker(global_memory_limit_bytes, "QueryPool", MemTracker::get_process_tracker(), + MemTrackerLevel::OVERVIEW); + REGISTER_HOOK_METRIC(query_mem_consumption, + [this]() { return _query_pool_mem_tracker->consumption(); }); + _load_pool_mem_tracker = MemTracker::create_tracker( + global_memory_limit_bytes, "LoadPool", MemTracker::get_process_tracker(), MemTrackerLevel::OVERVIEW); + REGISTER_HOOK_METRIC(load_mem_consumption, + [this]() { return _load_pool_mem_tracker->consumption(); }); LOG(INFO) << "Using global memory limit: " << PrettyPrinter::print(global_memory_limit_bytes, TUnit::BYTES) << ", origin config value: " << config::mem_limit; @@ -241,7 +256,7 @@ Status ExecEnv::_init_mem_tracker() { SegmentLoader::create_global_instance(config::segment_cache_capacity); // 4. init other managers - RETURN_IF_ERROR(_disk_io_mgr->init(_mem_tracker)); + RETURN_IF_ERROR(_disk_io_mgr->init(global_memory_limit_bytes)); RETURN_IF_ERROR(_tmp_file_mgr->init()); // TODO(zc): The current memory usage configuration is a bit confusing, @@ -300,7 +315,6 @@ void ExecEnv::_destroy() { SAFE_DELETE(_etl_thread_pool); SAFE_DELETE(_scan_thread_pool); SAFE_DELETE(_thread_mgr); - SAFE_DELETE(_pool_mem_trackers); SAFE_DELETE(_broker_client_cache); SAFE_DELETE(_extdatasource_client_cache); SAFE_DELETE(_frontend_client_cache); @@ -314,6 +328,7 @@ void ExecEnv::_destroy() { SAFE_DELETE(_heartbeat_flags); DEREGISTER_HOOK_METRIC(query_mem_consumption); + DEREGISTER_HOOK_METRIC(load_mem_consumption); _is_init = false; } diff --git a/be/src/runtime/export_sink.cpp b/be/src/runtime/export_sink.cpp index 9cc9f4c293792f..37cb719117e60c 100644 --- a/be/src/runtime/export_sink.cpp +++ b/be/src/runtime/export_sink.cpp @@ -28,7 +28,6 @@ #include "exprs/expr.h" #include "exprs/expr_context.h" #include "gutil/strings/numbers.h" -#include "runtime/mem_tracker.h" #include "runtime/mysql_table_sink.h" #include "runtime/row_batch.h" #include "runtime/runtime_state.h" @@ -72,10 +71,8 @@ Status ExportSink::prepare(RuntimeState* state) { _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); SCOPED_TIMER(_profile->total_time_counter()); - _mem_tracker = MemTracker::CreateTracker(-1, "ExportSink", state->instance_mem_tracker()); - // Prepare the exprs to run. 
- RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _mem_tracker)); + RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _expr_mem_tracker)); // TODO(lingbin): add some Counter _bytes_written_counter = ADD_COUNTER(profile(), "BytesExported", TUnit::BYTES); diff --git a/be/src/runtime/export_sink.h b/be/src/runtime/export_sink.h index c6bb7fe7cd3f59..7f46136c2d7a05 100644 --- a/be/src/runtime/export_sink.h +++ b/be/src/runtime/export_sink.h @@ -31,7 +31,6 @@ class TExpr; class RuntimeState; class RuntimeProfile; class ExprContext; -class MemTracker; class FileWriter; class TupleRow; @@ -75,8 +74,6 @@ class ExportSink : public DataSink { RuntimeProfile* _profile; - std::shared_ptr _mem_tracker; - RuntimeProfile::Counter* _bytes_written_counter; RuntimeProfile::Counter* _rows_written_counter; RuntimeProfile::Counter* _write_timer; diff --git a/be/src/runtime/fold_constant_executor.cpp b/be/src/runtime/fold_constant_executor.cpp index f093c04235ee93..5c6d3ace55c1f3 100644 --- a/be/src/runtime/fold_constant_executor.cpp +++ b/be/src/runtime/fold_constant_executor.cpp @@ -24,6 +24,7 @@ #include "runtime/runtime_state.h" #include "runtime/mem_tracker.h" #include "exprs/expr_context.h" +#include "runtime/thread_context.h" #include "exprs/expr.h" #include "common/object_pool.h" #include "common/status.h" @@ -43,6 +44,7 @@ TUniqueId FoldConstantExecutor::_dummy_id; Status FoldConstantExecutor::fold_constant_expr( const TFoldConstantParams& params, PConstantExprResult* response) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); const auto& expr_map = params.expr_map; auto expr_result_map = response->mutable_expr_result_map(); @@ -50,7 +52,6 @@ Status FoldConstantExecutor::fold_constant_expr( // init Status status = _init(query_globals); if (UNLIKELY(!status.ok())) { - LOG(WARNING) << "Failed to init mem trackers, msg: " << status.get_error_msg(); return status; } @@ -64,7 +65,6 @@ Status FoldConstantExecutor::fold_constant_expr( // prepare and open context status = _prepare_and_open(ctx); if (UNLIKELY(!status.ok())) { - LOG(WARNING) << "Failed to init mem trackers, msg: " << status.get_error_msg(); return status; } @@ -188,8 +188,8 @@ Status FoldConstantExecutor::_init(const TQueryGlobals& query_globals) { _runtime_profile = _runtime_state->runtime_profile(); _runtime_profile->set_name("FoldConstantExpr"); - _mem_tracker = MemTracker::CreateTracker(-1, "FoldConstantExpr", _runtime_state->instance_mem_tracker()); - _mem_pool.reset(new MemPool(_mem_tracker.get())); + _mem_tracker = MemTracker::create_tracker(-1, "FoldConstantExpr", _runtime_state->instance_mem_tracker()); + _mem_pool.reset(new MemPool(_mem_tracker)); return Status::OK(); } diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index dcff85c2d5a2df..74c819abc9ea35 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -43,6 +43,7 @@ #include "runtime/stream_load/load_stream_mgr.h" #include "runtime/stream_load/stream_load_context.h" #include "runtime/stream_load/stream_load_pipe.h" +#include "runtime/thread_context.h" #include "service/backend_options.h" #include "util/debug_util.h" #include "util/doris_metrics.h" @@ -82,6 +83,7 @@ class FragmentExecState { FragmentExecState(const TUniqueId& query_id, const TUniqueId& instance_id, int backend_num, ExecEnv* exec_env, const TNetworkAddress& coord_addr); + ~FragmentExecState(); Status prepare(const TExecPlanFragmentParams& params); @@ -94,7 +96,7 @@ class FragmentExecState { Status 
    cancel_before_execute();
-    Status cancel(const PPlanFragmentCancelReason& reason);
+    Status cancel(const PPlanFragmentCancelReason& reason, const std::string& msg = "");
     TUniqueId fragment_instance_id() const { return _fragment_instance_id; }
@@ -134,6 +136,10 @@ class FragmentExecState {
         return false;
     }
+    bool is_canceling() const { return _is_canceling; }
+
+    void set_is_canceling() { _is_canceling = true; }
+
     int get_timeout_second() const { return _timeout_second; }
     std::shared_ptr<QueryFragmentsCtx> get_fragments_ctx() { return _fragments_ctx; }
@@ -155,6 +161,7 @@ class FragmentExecState {
     PlanFragmentExecutor _executor;
     DateTimeValue _start_time;
+    bool _is_canceling = false;
     std::mutex _status_lock;
     Status _exec_status;
@@ -165,6 +172,7 @@ class FragmentExecState {
     int _timeout_second;
+    std::unique_ptr _exec_thread;
     // This context is shared by all fragments of this host in a query
     std::shared_ptr<QueryFragmentsCtx> _fragments_ctx;
@@ -207,6 +215,7 @@ FragmentExecState::FragmentExecState(const TUniqueId& query_id,
     _start_time = DateTimeValue::local_time();
 }
+FragmentExecState::~FragmentExecState() {}
 Status FragmentExecState::prepare(const TExecPlanFragmentParams& params) {
     if (params.__isset.query_options) {
@@ -242,6 +251,11 @@ Status FragmentExecState::execute() {
 Status FragmentExecState::cancel_before_execute() {
     // set status as 'abort', because cancel() won't affect the status arg of DataSink::close().
+    // TODO(zxy) 2ARG
+    SCOPED_ATTACH_TASK_THREAD_4ARG(executor()->runtime_state()->query_type(),
+                                   print_id(query_id()),
+                                   fragment_instance_id(),
+                                   executor()->runtime_state()->instance_mem_tracker());
     _executor.set_abort();
     _executor.cancel();
     if (_pipe != nullptr) {
@@ -250,13 +264,13 @@ Status FragmentExecState::cancel_before_execute() {
     return Status::OK();
 }
-Status FragmentExecState::cancel(const PPlanFragmentCancelReason& reason) {
+Status FragmentExecState::cancel(const PPlanFragmentCancelReason& reason, const std::string& msg) {
     std::lock_guard l(_status_lock);
     RETURN_IF_ERROR(_exec_status);
     if (reason == PPlanFragmentCancelReason::LIMIT_REACH) {
         _executor.set_is_report_on_cancel(false);
     }
-    _executor.cancel();
+    _executor.cancel(reason, msg);
     if (_pipe != nullptr) {
         _pipe->cancel(PPlanFragmentCancelReason_Name(reason));
     }
@@ -301,12 +315,12 @@ void FragmentExecState::coordinator_callback(const Status& status, RuntimeProfil
     RuntimeState* runtime_state = _executor.runtime_state();
     DCHECK(runtime_state != nullptr);
-    if (runtime_state->query_options().query_type == TQueryType::LOAD && !done && status.ok()) {
+    if (runtime_state->query_type() == TQueryType::LOAD && !done && status.ok()) {
         // this is a load plan, and load is not finished, just make a brief report
         params.__set_loaded_rows(runtime_state->num_rows_load_total());
         params.__set_loaded_bytes(runtime_state->num_bytes_load_total());
     } else {
-        if (runtime_state->query_options().query_type == TQueryType::LOAD) {
+        if (runtime_state->query_type() == TQueryType::LOAD) {
            params.__set_loaded_rows(runtime_state->num_rows_load_total());
            params.__set_loaded_bytes(runtime_state->num_bytes_load_total());
        }
@@ -461,6 +475,10 @@ void FragmentMgr::_exec_actual(std::shared_ptr exec_state, Fi
             .query_id(exec_state->query_id())
             .instance_id(exec_state->fragment_instance_id())
             .tag("pthread_id", std::to_string((uintptr_t)pthread_self()));
+    SCOPED_ATTACH_TASK_THREAD_4ARG(exec_state->executor()->runtime_state()->query_type(),
+                                   print_id(exec_state->query_id()),
+                                   exec_state->fragment_instance_id(),
+                                   exec_state->executor()->runtime_state()->instance_mem_tracker());
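// SCOPED_ATTACH_TASK_THREAD_4ARG above is assumed to bind this worker thread to the
// fragment for the rest of the scope, so the TCMalloc new/delete hooks charge every
// allocation made while executing the plan to the instance MemTracker. Illustrative
// shape only (the macro's real definition lives in runtime/thread_context.h):
//
//     {
//         SCOPED_ATTACH_TASK_THREAD_4ARG(type, query_id, instance_id, tracker);
//         exec_state->execute(); // allocations here are consumed against 'tracker'
//     }                          // thread detaches and reverts to the previous context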
exec_state->execute(); std::shared_ptr fragments_ctx = exec_state->get_fragments_ctx(); @@ -643,7 +661,8 @@ Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params, Fi return Status::OK(); } -Status FragmentMgr::cancel(const TUniqueId& fragment_id, const PPlanFragmentCancelReason& reason) { +Status FragmentMgr::cancel(const TUniqueId& fragment_id, const PPlanFragmentCancelReason& reason, + const std::string& msg) { std::shared_ptr exec_state; { std::lock_guard lock(_lock); @@ -653,12 +672,30 @@ Status FragmentMgr::cancel(const TUniqueId& fragment_id, const PPlanFragmentCanc return Status::OK(); } exec_state = iter->second; + exec_state->set_is_canceling(); } - exec_state->cancel(reason); + exec_state->cancel(reason, msg); return Status::OK(); } +Status FragmentMgr::is_canceling(const TUniqueId& fragment_id) { + std::shared_ptr exec_state; + { + std::lock_guard lock(_lock); + auto iter = _fragment_map.find(fragment_id); + if (iter != _fragment_map.end()) { + exec_state = iter->second; + if (exec_state->is_canceling()) { + return Status::Cancelled("Canceling"); + } else { + return Status::OK(); + } + } + } + return Status::InternalError("FragmentID not found"); +} + void FragmentMgr::cancel_worker() { LOG(INFO) << "FragmentMgr cancel worker start working."; do { diff --git a/be/src/runtime/fragment_mgr.h b/be/src/runtime/fragment_mgr.h index ba562164a2bbb9..8be4255ea8d2f2 100644 --- a/be/src/runtime/fragment_mgr.h +++ b/be/src/runtime/fragment_mgr.h @@ -70,7 +70,10 @@ class FragmentMgr : public RestMonitorIface { return cancel(fragment_id, PPlanFragmentCancelReason::INTERNAL_ERROR); } - Status cancel(const TUniqueId& fragment_id, const PPlanFragmentCancelReason& reason); + Status cancel(const TUniqueId& fragment_id, const PPlanFragmentCancelReason& reason, + const std::string& msg = ""); + + Status is_canceling(const TUniqueId& fragment_id); void cancel_worker(); diff --git a/be/src/runtime/free_pool.hpp b/be/src/runtime/free_pool.hpp index 379d2549aef696..6dde1648edf857 100644 --- a/be/src/runtime/free_pool.hpp +++ b/be/src/runtime/free_pool.hpp @@ -40,7 +40,7 @@ namespace doris { // contains the link to the next allocation. // This has O(1) Allocate() and Free(). // This is not thread safe. -// TODO: consider integrating this with MemPool. +// TODO(zxy): consider integrating this with MemPool. // TODO: consider changing to something more granular than doubling. class FreePool { public: diff --git a/be/src/runtime/initial_reservations.cc b/be/src/runtime/initial_reservations.cc index adbc2be09d7883..86b1f2f1b65e6c 100644 --- a/be/src/runtime/initial_reservations.cc +++ b/be/src/runtime/initial_reservations.cc @@ -38,7 +38,7 @@ InitialReservations::InitialReservations(ObjectPool* obj_pool, std::shared_ptr query_mem_tracker, int64_t initial_reservation_total_claims) : initial_reservation_mem_tracker_( - MemTracker::CreateTracker(-1, "InitialReservations", query_mem_tracker, false)), + MemTracker::create_tracker(-1, "InitialReservations", query_mem_tracker)), remaining_initial_reservation_claims_(initial_reservation_total_claims) { initial_reservations_.InitChildTracker(nullptr, query_reservation, initial_reservation_mem_tracker_.get(), @@ -83,7 +83,5 @@ void InitialReservations::Return(BufferPool::ClientHandle* src, int64_t bytes) { void InitialReservations::ReleaseResources() { initial_reservations_.Close(); - // TODO(HW): Close() is private. 
make this tracker shared later - // initial_reservation_mem_tracker_->Close(); } } // namespace doris diff --git a/be/src/runtime/load_channel.cpp b/be/src/runtime/load_channel.cpp index ee33cc3fdbbc07..3e961ecfde6f3a 100644 --- a/be/src/runtime/load_channel.cpp +++ b/be/src/runtime/load_channel.cpp @@ -19,17 +19,17 @@ #include "olap/lru_cache.h" #include "runtime/mem_tracker.h" +#include "runtime/thread_context.h" #include "runtime/tablets_channel.h" namespace doris { LoadChannel::LoadChannel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, - const std::shared_ptr& mem_tracker, bool is_high_priority, - const std::string& sender_ip) + bool is_high_priority, const std::string& sender_ip) : _load_id(load_id), _timeout_s(timeout_s), _is_high_priority(is_high_priority), _sender_ip(sender_ip) { - _mem_tracker = MemTracker::CreateTracker( - mem_limit, "LoadChannel:" + _load_id.to_string(), mem_tracker, true, false, MemTrackerLevel::TASK); + _mem_tracker = MemTracker::create_tracker( + mem_limit, "LoadChannel:" + _load_id.to_string(), nullptr, MemTrackerLevel::TASK); // _last_updated_time should be set before being inserted to // _load_channels in load_channel_mgr, or it may be erased // immediately by gc thread. @@ -43,6 +43,7 @@ LoadChannel::~LoadChannel() { } Status LoadChannel::open(const PTabletWriterOpenRequest& params) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t index_id = params.index_id(); std::shared_ptr channel; { @@ -53,7 +54,7 @@ Status LoadChannel::open(const PTabletWriterOpenRequest& params) { } else { // create a new tablets channel TabletsChannelKey key(params.id(), index_id); - channel.reset(new TabletsChannel(key, _mem_tracker, _is_high_priority)); + channel.reset(new TabletsChannel(key, _is_high_priority)); _tablets_channels.insert({index_id, channel}); } } @@ -67,6 +68,7 @@ Status LoadChannel::open(const PTabletWriterOpenRequest& params) { Status LoadChannel::add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int64_t index_id = request.index_id(); // 1. 
get tablets channel std::shared_ptr channel; @@ -111,6 +113,7 @@ Status LoadChannel::add_batch(const PTabletWriterAddBatchRequest& request, } void LoadChannel::handle_mem_exceed_limit(bool force) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // lock so that only one thread can check mem limit std::lock_guard l(_lock); if (!(force || _mem_tracker->limit_exceeded())) { @@ -145,6 +148,7 @@ bool LoadChannel::_find_largest_consumption_channel(std::shared_ptr l(_lock); for (auto& it : _tablets_channels) { it.second->cancel(); diff --git a/be/src/runtime/load_channel.h b/be/src/runtime/load_channel.h index 13490f5fa847f0..ba0ad3033498c8 100644 --- a/be/src/runtime/load_channel.h +++ b/be/src/runtime/load_channel.h @@ -39,8 +39,7 @@ class TabletsChannel; class LoadChannel { public: LoadChannel(const UniqueId& load_id, int64_t mem_limit, int64_t timeout_s, - const std::shared_ptr& mem_tracker, bool is_high_priority, - const std::string& sender_ip); + bool is_high_priority, const std::string& sender_ip); ~LoadChannel(); // open a new load channel if not exist diff --git a/be/src/runtime/load_channel_mgr.cpp b/be/src/runtime/load_channel_mgr.cpp index baa0e8891c41cf..08eabfbdd7e89c 100644 --- a/be/src/runtime/load_channel_mgr.cpp +++ b/be/src/runtime/load_channel_mgr.cpp @@ -21,6 +21,7 @@ #include "olap/lru_cache.h" #include "runtime/load_channel.h" #include "runtime/mem_tracker.h" +#include "runtime/thread_context.h" #include "service/backend_options.h" #include "util/doris_metrics.h" #include "util/stopwatch.hpp" @@ -28,8 +29,8 @@ namespace doris { DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(load_channel_count, MetricUnit::NOUNIT); -DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(load_mem_consumption, MetricUnit::BYTES, "", - mem_consumption, Labels({{"type", "load"}})); +DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(load_channel_mem_consumption, MetricUnit::BYTES, "", mem_consumption, + Labels({{"type", "load"}})); // Calculate the total memory limit of all load tasks on this BE static int64_t calc_process_max_load_memory(int64_t process_mem_limit) { @@ -70,12 +71,11 @@ LoadChannelMgr::LoadChannelMgr() : _stop_background_threads_latch(1) { std::lock_guard l(_lock); return _load_channels.size(); }); - _last_success_channel = new_lru_cache("LastestSuccessChannelCache", 1024, _mem_tracker); } LoadChannelMgr::~LoadChannelMgr() { DEREGISTER_HOOK_METRIC(load_channel_count); - DEREGISTER_HOOK_METRIC(load_mem_consumption); + DEREGISTER_HOOK_METRIC(load_channel_mem_consumption); _stop_background_threads_latch.count_down(); if (_load_channels_clean_thread) { _load_channels_clean_thread->join(); @@ -85,15 +85,18 @@ LoadChannelMgr::~LoadChannelMgr() { Status LoadChannelMgr::init(int64_t process_mem_limit) { int64_t load_mem_limit = calc_process_max_load_memory(process_mem_limit); - _mem_tracker = MemTracker::CreateTracker(load_mem_limit, "LoadChannelMgr", nullptr, true, false, MemTrackerLevel::OVERVIEW); - REGISTER_HOOK_METRIC(load_mem_consumption, [this]() { - return _mem_tracker->consumption(); - }); + _mem_tracker = MemTracker::create_tracker(load_mem_limit, "LoadChannelMgr", + MemTracker::get_process_tracker(), + MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + REGISTER_HOOK_METRIC(load_channel_mem_consumption, [this]() { return _mem_tracker->consumption(); }); + _last_success_channel = new_lru_cache("LastestSuccessChannelCache", 1024); RETURN_IF_ERROR(_start_bg_worker()); return Status::OK(); } Status LoadChannelMgr::open(const PTabletWriterOpenRequest& params) { + 
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); UniqueId load_id(params.id()); std::shared_ptr channel; { @@ -112,7 +115,7 @@ Status LoadChannelMgr::open(const PTabletWriterOpenRequest& params) { int64_t job_timeout_s = calc_job_timeout_s(timeout_in_req_s); bool is_high_priority = (params.has_is_high_priority() && params.is_high_priority()); - channel.reset(new LoadChannel(load_id, job_max_memory, job_timeout_s, _mem_tracker, is_high_priority, + channel.reset(new LoadChannel(load_id, job_max_memory, job_timeout_s, is_high_priority, params.sender_ip())); _load_channels.insert({load_id, channel}); } @@ -126,6 +129,7 @@ static void dummy_deleter(const CacheKey& key, void* value) {} Status LoadChannelMgr::add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); UniqueId load_id(request.id()); // 1. get load channel std::shared_ptr channel; @@ -175,6 +179,7 @@ Status LoadChannelMgr::add_batch(const PTabletWriterAddBatchRequest& request, } void LoadChannelMgr::_handle_mem_exceed_limit() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); // lock so that only one thread can check mem limit std::lock_guard l(_lock); if (!_mem_tracker->limit_exceeded()) { @@ -208,6 +213,7 @@ void LoadChannelMgr::_handle_mem_exceed_limit() { } Status LoadChannelMgr::cancel(const PTabletWriterCancelRequest& params) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); UniqueId load_id(params.id()); std::shared_ptr cancelled_channel; { @@ -248,6 +254,7 @@ Status LoadChannelMgr::_start_bg_worker() { } Status LoadChannelMgr::_start_load_channels_clean() { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::vector> need_delete_channels; LOG(INFO) << "cleaning timed out load channels"; time_t now = time(nullptr); diff --git a/be/src/runtime/mem_pool.cpp b/be/src/runtime/mem_pool.cpp index bcaaa27c448b25..9bf2d74b36af54 100644 --- a/be/src/runtime/mem_pool.cpp +++ b/be/src/runtime/mem_pool.cpp @@ -24,6 +24,7 @@ #include "runtime/mem_tracker.h" #include "runtime/memory/chunk_allocator.h" +#include "runtime/thread_context.h" #include "util/bit_util.h" #include "util/doris_metrics.h" @@ -37,6 +38,31 @@ const int MemPool::MAX_CHUNK_SIZE; const int MemPool::DEFAULT_ALIGNMENT; uint32_t MemPool::k_zero_length_region_ alignas(std::max_align_t) = MEM_POOL_POISON; +MemPool::MemPool(std::shared_ptr mem_tracker) + : current_chunk_idx_(-1), + next_chunk_size_(INITIAL_CHUNK_SIZE), + total_allocated_bytes_(0), + total_reserved_bytes_(0), + peak_allocated_bytes_(0), + _mem_tracker(mem_tracker) {} + +MemPool::MemPool(std::string label) + : current_chunk_idx_(-1), + next_chunk_size_(INITIAL_CHUNK_SIZE), + total_allocated_bytes_(0), + total_reserved_bytes_(0), + peak_allocated_bytes_(0) { + _mem_tracker = MemTracker::create_tracker(-1, label + ":MemPool"); +} + +MemPool::MemPool() + : current_chunk_idx_(-1), + next_chunk_size_(INITIAL_CHUNK_SIZE), + total_allocated_bytes_(0), + total_reserved_bytes_(0), + peak_allocated_bytes_(0), + _mem_tracker(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()) {} + MemPool::ChunkInfo::ChunkInfo(const Chunk& chunk_) : chunk(chunk_), allocated_bytes(0) { DorisMetrics::instance()->memory_pool_bytes_total->increment(chunk.size); } @@ -45,9 +71,8 @@ MemPool::~MemPool() { int64_t total_bytes_released = 0; for (auto& chunk : chunks_) { total_bytes_released += chunk.chunk.size; - ChunkAllocator::instance()->free(chunk.chunk); + 
ChunkAllocator::instance()->free(chunk.chunk, _mem_tracker); } - mem_tracker_->Release(total_bytes_released); DorisMetrics::instance()->memory_pool_bytes_total->increment(-total_bytes_released); } @@ -65,7 +90,7 @@ void MemPool::free_all() { int64_t total_bytes_released = 0; for (auto& chunk : chunks_) { total_bytes_released += chunk.chunk.size; - ChunkAllocator::instance()->free(chunk.chunk); + ChunkAllocator::instance()->free(chunk.chunk, _mem_tracker); } chunks_.clear(); next_chunk_size_ = INITIAL_CHUNK_SIZE; @@ -73,16 +98,15 @@ void MemPool::free_all() { total_allocated_bytes_ = 0; total_reserved_bytes_ = 0; - mem_tracker_->Release(total_bytes_released); DorisMetrics::instance()->memory_pool_bytes_total->increment(-total_bytes_released); } -bool MemPool::find_chunk(size_t min_size, bool check_limits) { +Status MemPool::find_chunk(size_t min_size, bool check_limits) { // Try to allocate from a free chunk. We may have free chunks after the current chunk // if Clear() was called. The current chunk may be free if ReturnPartialAllocation() // was called. The first free chunk (if there is one) can therefore be either the // current chunk or the chunk immediately after the current chunk. - int first_free_idx; + int first_free_idx = 0; if (current_chunk_idx_ == -1) { first_free_idx = 0; } else { @@ -97,7 +121,7 @@ bool MemPool::find_chunk(size_t min_size, bool check_limits) { if (idx != first_free_idx) std::swap(chunks_[idx], chunks_[first_free_idx]); current_chunk_idx_ = first_free_idx; DCHECK(check_integrity(true)); - return true; + return Status::OK(); } } @@ -115,20 +139,10 @@ bool MemPool::find_chunk(size_t min_size, bool check_limits) { } chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size); - if (check_limits) { - Status st = mem_tracker_->TryConsume(chunk_size); - WARN_IF_ERROR(st, "try to allocate a new buffer failed"); - if (!st) return false; - } else { - mem_tracker_->Consume(chunk_size); - } // Allocate a new chunk. Return early if allocate fails. Chunk chunk; - if (!ChunkAllocator::instance()->allocate(chunk_size, &chunk)) { - mem_tracker_->Release(chunk_size); - return false; - } + RETURN_IF_ERROR(ChunkAllocator::instance()->allocate(chunk_size, &chunk, _mem_tracker, check_limits)); ASAN_POISON_MEMORY_REGION(chunk.data, chunk_size); // Put it before the first free chunk. If no free chunks, it goes at the end. if (first_free_idx == static_cast(chunks_.size())) { @@ -143,12 +157,12 @@ bool MemPool::find_chunk(size_t min_size, bool check_limits) { next_chunk_size_ = static_cast(std::min(chunk_size * 2, MAX_CHUNK_SIZE)); DCHECK(check_integrity(true)); - return true; + return Status::OK(); } void MemPool::acquire_data(MemPool* src, bool keep_current) { DCHECK(src->check_integrity(false)); - int num_acquired_chunks; + int num_acquired_chunks = 0; if (keep_current) { num_acquired_chunks = src->current_chunk_idx_; } else if (src->get_free_offset() == 0) { @@ -172,9 +186,8 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) { total_reserved_bytes_ += total_transferred_bytes; // Skip unnecessary atomic ops if the mem_trackers are the same. 
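    // transfer_to() below is roughly equivalent to the old pair
    //   src->_mem_tracker->release(total_transferred_bytes);
    //   _mem_tracker->consume(total_transferred_bytes);
    // except that both updates go through the batched consume_cache() path
    // (a sketch based on the transfer_to() definition elsewhere in this patch).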
-    if (src->mem_tracker_ != mem_tracker_) {
-        src->mem_tracker_->Release(total_transferred_bytes);
-        mem_tracker_->Consume(total_transferred_bytes);
+    if (src->_mem_tracker != _mem_tracker) {
+        src->_mem_tracker->transfer_to(_mem_tracker, total_transferred_bytes);
     }
 
     // insert new chunks after current_chunk_idx_
@@ -203,6 +216,7 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) {
 
 void MemPool::exchange_data(MemPool* other) {
     int64_t delta_size = other->total_reserved_bytes_ - total_reserved_bytes_;
+    other->_mem_tracker->transfer_to(_mem_tracker, delta_size);
 
     std::swap(current_chunk_idx_, other->current_chunk_idx_);
     std::swap(next_chunk_size_, other->next_chunk_size_);
@@ -210,10 +224,6 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) {
     std::swap(total_reserved_bytes_, other->total_reserved_bytes_);
     std::swap(peak_allocated_bytes_, other->peak_allocated_bytes_);
     std::swap(chunks_, other->chunks_);
-
-    // update MemTracker
-    mem_tracker_->Consume(delta_size);
-    other->mem_tracker_->Release(delta_size);
 }
 
 std::string MemPool::debug_string() {
diff --git a/be/src/runtime/mem_pool.h b/be/src/runtime/mem_pool.h
index 04d62368315406..87863ebab71c8e 100644
--- a/be/src/runtime/mem_pool.h
+++ b/be/src/runtime/mem_pool.h
@@ -27,6 +27,7 @@
 
 #include "common/config.h"
 #include "common/logging.h"
+#include "common/status.h"
 #include "gutil/dynamic_annotations.h"
 #include "olap/olap_define.h"
 #include "runtime/memory/chunk.h"
@@ -88,16 +89,9 @@ class MemTracker;
 /// delete p;
 class MemPool {
 public:
-    /// 'tracker' tracks the amount of memory allocated by this pool. Must not be nullptr.
-    MemPool(MemTracker* mem_tracker)
-        : current_chunk_idx_(-1),
-          next_chunk_size_(INITIAL_CHUNK_SIZE),
-          total_allocated_bytes_(0),
-          total_reserved_bytes_(0),
-          peak_allocated_bytes_(0),
-          mem_tracker_(mem_tracker) {
-        DCHECK(mem_tracker != nullptr);
-    }
+    MemPool(std::shared_ptr<MemTracker> mem_tracker);
+    MemPool(std::string label);
+    MemPool();
 
     /// Frees all chunks of memory and subtracts the total allocated bytes
     /// from the registered limits.
@@ -106,33 +100,37 @@ class MemPool {
     /// Allocates a section of memory of 'size' bytes with DEFAULT_ALIGNMENT at the end
     /// of the current chunk. Creates a new chunk if there aren't any chunks
     /// with enough capacity.
-    uint8_t* allocate(int64_t size) { return allocate<false>(size, DEFAULT_ALIGNMENT); }
+    uint8_t* allocate(int64_t size, Status* rst = nullptr) {
+        return allocate<false>(size, DEFAULT_ALIGNMENT, rst);
+    }
 
     /// Same as Allocate() except it adds a check when returning a nullptr
-    OLAPStatus allocate_safely(int64_t size, uint8_t*& ret) {
-        return allocate_safely<false>(size, DEFAULT_ALIGNMENT, ret);
+    OLAPStatus allocate_safely(int64_t size, uint8_t*& ret, Status* rst = nullptr) {
+        return allocate_safely<false>(size, DEFAULT_ALIGNMENT, ret, rst);
     }
 
     /// Same as Allocate() except the mem limit is checked before the allocation and
     /// this call will fail (returns nullptr) if it does.
     /// The caller must handle the nullptr case. This should be used for allocations
     /// where the size can be very big to bound the amount by which we exceed mem limits.
-    uint8_t* try_allocate(int64_t size) { return allocate<true>(size, DEFAULT_ALIGNMENT); }
+    uint8_t* try_allocate(int64_t size, Status* rst = nullptr) {
+        return allocate<true>(size, DEFAULT_ALIGNMENT, rst);
+    }
 
     /// Same as TryAllocate() except a non-default alignment can be specified. It
     /// should be a power-of-two in [1, alignof(std::max_align_t)].
- uint8_t* try_allocate_aligned(int64_t size, int alignment) { + uint8_t* try_allocate_aligned(int64_t size, int alignment, Status* rst = nullptr) { DCHECK_GE(alignment, 1); DCHECK_LE(alignment, config::memory_max_alignment); DCHECK_EQ(BitUtil::RoundUpToPowerOfTwo(alignment), alignment); - return allocate(size, alignment); + return allocate(size, alignment, rst); } /// Same as TryAllocate() except returned memory is not aligned at all. - uint8_t* try_allocate_unaligned(int64_t size) { + uint8_t* try_allocate_unaligned(int64_t size, Status* rst = nullptr) { // Call templated implementation directly so that it is inlined here and the // alignment logic can be optimised out. - return allocate(size, 1); + return allocate(size, 1, rst); } /// Makes all allocated chunks available for re-use, but doesn't delete any chunks. @@ -159,7 +157,7 @@ class MemPool { int64_t total_reserved_bytes() const { return total_reserved_bytes_; } int64_t peak_allocated_bytes() const { return peak_allocated_bytes_; } - MemTracker* mem_tracker() { return mem_tracker_; } + std::shared_ptr mem_tracker() { return _mem_tracker; } static constexpr int DEFAULT_ALIGNMENT = 8; @@ -189,7 +187,7 @@ class MemPool { /// if a new chunk needs to be created. /// If check_limits is true, this call can fail (returns false) if adding a /// new chunk exceeds the mem limits. - bool find_chunk(size_t min_size, bool check_limits); + Status find_chunk(size_t min_size, bool check_limits); /// Check integrity of the supporting data structures; always returns true but DCHECKs /// all invariants. @@ -204,7 +202,7 @@ class MemPool { } template - uint8_t* ALWAYS_INLINE allocate(int64_t size, int alignment) { + uint8_t* ALWAYS_INLINE allocate(int64_t size, int alignment, Status* rst) { DCHECK_GE(size, 0); if (UNLIKELY(size == 0)) return reinterpret_cast(&k_zero_length_region_); @@ -230,7 +228,12 @@ class MemPool { // guarantee alignment. //static_assert( //INITIAL_CHUNK_SIZE >= config::FLAGS_MEMORY_MAX_ALIGNMENT, "Min chunk size too low"); - if (UNLIKELY(!find_chunk(size, CHECK_LIMIT_FIRST))) return nullptr; + if (rst == nullptr) { + if (UNLIKELY(!find_chunk(size, CHECK_LIMIT_FIRST))) return nullptr; + } else { + *rst = find_chunk(size, CHECK_LIMIT_FIRST); + if (UNLIKELY(!*rst)) return nullptr; + } ChunkInfo& info = chunks_[current_chunk_idx_]; uint8_t* result = info.chunk.data + info.allocated_bytes; @@ -244,8 +247,9 @@ class MemPool { } template - OLAPStatus ALWAYS_INLINE allocate_safely(int64_t size, int alignment, uint8_t*& ret) { - uint8_t* result = allocate(size, alignment); + OLAPStatus ALWAYS_INLINE allocate_safely(int64_t size, int alignment, uint8_t*& ret, + Status* rst = nullptr) { + uint8_t* result = allocate(size, alignment, rst); if (result == nullptr) { return OLAP_ERR_MALLOC_ERROR; } @@ -278,12 +282,12 @@ class MemPool { /// The current and peak memory footprint of this pool. This is different from /// total allocated_bytes_ since it includes bytes in chunks that are not used. 
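    // A usage sketch for the three constructors declared above (illustrative only;
    // "tracker" is a placeholder std::shared_ptr<MemTracker>):
    //   MemPool a(tracker);   // charge chunks against an explicitly supplied tracker
    //   MemPool b("Sorter");  // creates a private "Sorter:MemPool" tracker
    //   MemPool c;            // adopts the current thread-local tracker, so the
    //                         // enclosing SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_*
    //                         // scope decides where its memory is charged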
-    MemTracker* mem_tracker_;
+    std::shared_ptr<MemTracker> _mem_tracker;
 };
 
 // Stamp out templated implementations here so they're included in IR module
-template uint8_t* MemPool::allocate<false>(int64_t size, int alignment);
-template uint8_t* MemPool::allocate<true>(int64_t size, int alignment);
+template uint8_t* MemPool::allocate<false>(int64_t size, int alignment, Status* rst);
+template uint8_t* MemPool::allocate<true>(int64_t size, int alignment, Status* rst);
 
 } // namespace doris
 
 #endif
diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp
index 350f7bc3119668..a6695e4b18b97c 100644
--- a/be/src/runtime/mem_tracker.cpp
+++ b/be/src/runtime/mem_tracker.cpp
@@ -17,319 +17,175 @@
 
 #include "runtime/mem_tracker.h"
 
-#include
+#include
 
-#include
-#include
-
-#include
 #include
 
 #include "exec/exec_node.h"
 #include "gutil/once.h"
-#include "gutil/strings/substitute.h"
-#include "runtime/bufferpool/reservation_tracker_counters.h"
 #include "runtime/exec_env.h"
 #include "runtime/runtime_state.h"
+#include "runtime/thread_context.h"
 #include "service/backend_options.h"
-#include "util/debug_util.h"
-#include "util/doris_metrics.h"
-#include "util/mem_info.h"
 #include "util/pretty_printer.h"
-#include "util/stack_util.h"
+#include "util/string_util.h"
 #include "util/uid_util.h"
 
-using boost::join;
-using std::deque;
-using std::endl;
-using std::greater;
-using std::list;
-using std::pair;
-using std::priority_queue;
-using std::shared_ptr;
-using std::string;
-
-using std::vector;
-using std::weak_ptr;
-using strings::Substitute;
-
 namespace doris {
 
 const std::string MemTracker::COUNTER_NAME = "PeakMemoryUsage";
 
-// Name for request pool MemTrackers. '$0' is replaced with the pool name.
-const std::string REQUEST_POOL_MEM_TRACKER_LABEL_FORMAT = "RequestPool=$0";
-
-/// Calculate the soft limit for a MemTracker based on the hard limit 'limit'.
-static int64_t CalcSoftLimit(int64_t limit) {
-    if (limit < 0) return -1;
-    double frac = std::max(0.0, std::min(1.0, config::soft_mem_limit_frac));
-    return static_cast<int64_t>(limit * frac);
+// The ancestor for all trackers. Every tracker is visible from the process down.
+// Consume/release on a child tracker is not synchronized to the process tracker;
+// the process tracker independently accounts the real memory of the process in the
+// TCMalloc new/delete hook.
+static std::shared_ptr<MemTracker> process_tracker;
+static MemTracker* raw_process_tracker;
+static GoogleOnceType process_tracker_once = GOOGLE_ONCE_INIT;
+
+void MemTracker::create_process_tracker() {
+    process_tracker.reset(new MemTracker(-1, "Process", nullptr, MemTrackerLevel::OVERVIEW, nullptr));
+    process_tracker->init();
+    raw_process_tracker = process_tracker.get();
 }
 
-// The ancestor for all trackers. Every tracker is visible from the root down.
-static std::shared_ptr root_tracker; -static GoogleOnceType root_tracker_once = GOOGLE_ONCE_INIT; +std::shared_ptr MemTracker::get_process_tracker() { + GoogleOnceInit(&process_tracker_once, &MemTracker::create_process_tracker); + return process_tracker; +} -void MemTracker::CreateRootTracker() { - root_tracker.reset(new MemTracker(nullptr, -1, "Root", nullptr, true, MemTrackerLevel::OVERVIEW)); - root_tracker->Init(); +MemTracker* MemTracker::get_raw_process_tracker() { + GoogleOnceInit(&process_tracker_once, &MemTracker::create_process_tracker); + return raw_process_tracker; } -std::shared_ptr MemTracker::CreateTracker(RuntimeProfile* profile, int64_t byte_limit, - const std::string& label, const std::shared_ptr& parent, - bool reset_label_name, MemTrackerLevel level) { - std::shared_ptr real_parent; - std::string label_name; - // if parent is not null, reset label name to query id. - // The parent label always: RuntimeState:instance:8ca5a59e3aa84f74-84bb0d0466193736 - // we just need the last id of it: 8ca5a59e3aa84f74-84bb0d0466193736 - // to build the new label name of tracker: `label`: 8ca5a59e3aa84f74-84bb0d0466193736 - // else if parent is null - // just use the root is parent and keep the label_name as label - if (parent) { - real_parent = parent; - if (reset_label_name) { - std::vector tmp_result; - boost::split(tmp_result, parent->label(), boost::is_any_of(":")); - label_name = label + ":" + tmp_result[tmp_result.size() - 1]; - } else { - label_name = label; +void MemTracker::list_process_trackers(std::vector>* trackers) { + trackers->clear(); + std::deque> to_process; + to_process.push_front(get_process_tracker()); + while (!to_process.empty()) { + std::shared_ptr t = to_process.back(); + to_process.pop_back(); + + trackers->push_back(t); + std::list> children; + { + lock_guard l(t->_child_trackers_lock); + children = t->_child_trackers; + } + for (const auto& child_weak : children) { + std::shared_ptr child = child_weak.lock(); + if (child && static_cast(child->_level) <= + config::mem_tracker_level) { + to_process.emplace_back(std::move(child)); + } } - } else { - real_parent = GetRootTracker(); - label_name = label; } +} - shared_ptr tracker(new MemTracker(profile, byte_limit, label_name, real_parent, true, - level > real_parent->_level ? level : real_parent->_level)); - real_parent->AddChildTracker(tracker); - tracker->Init(); - +std::shared_ptr MemTracker::create_tracker(int64_t byte_limit, const std::string& label, + const std::shared_ptr& parent, + MemTrackerLevel level, + RuntimeProfile* profile) { + std::shared_ptr reset_parent = parent ? parent : thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker(); + DCHECK(reset_parent); + + std::shared_ptr tracker(new MemTracker( + byte_limit, label, + reset_parent, level > reset_parent->_level ? level : reset_parent->_level, profile)); + reset_parent->add_child_tracker(tracker); + tracker->init(); return tracker; } -std::shared_ptr MemTracker::CreateTracker(int64_t byte_limit, const std::string& label, - std::shared_ptr parent, bool log_usage_if_zero, bool reset_label_name, MemTrackerLevel level) { - std::shared_ptr real_parent; - std::string label_name; - // if parent is not null, reset label name to query id. 
- // The parent label always: RuntimeState:instance:8ca5a59e3aa84f74-84bb0d0466193736 - // we just need the last id of it: 8ca5a59e3aa84f74-84bb0d0466193736 - // to build the new label name of tracker: `label`: 8ca5a59e3aa84f74-84bb0d0466193736 - // else if parent is null - // just use the root is parent and keep the label_name as label - if (parent) { - real_parent = parent; - if (reset_label_name) { - std::vector tmp_result; - boost::split(tmp_result, parent->label(), boost::is_any_of(":")); - label_name = label + ":" + tmp_result[tmp_result.size() - 1]; - } else { - label_name = label; - } - } else { - real_parent = GetRootTracker(); - label_name = label; - } - - shared_ptr tracker( - new MemTracker(nullptr, byte_limit, label_name, real_parent, log_usage_if_zero, - level > real_parent->_level ? level : real_parent->_level)); - real_parent->AddChildTracker(tracker); - tracker->Init(); +std::shared_ptr MemTracker::create_virtual_tracker( + int64_t byte_limit, const std::string& label, const std::shared_ptr& parent, + MemTrackerLevel level) { + std::shared_ptr reset_parent = parent ? parent : thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker(); + DCHECK(reset_parent); + std::shared_ptr tracker(new MemTracker( + byte_limit, "[Virtual]-" + label, reset_parent, level, nullptr)); + reset_parent->add_child_tracker(tracker); + tracker->init_virtual(); return tracker; } MemTracker::MemTracker(int64_t byte_limit, const std::string& label) - : MemTracker(nullptr, byte_limit, label, std::shared_ptr(), true, MemTrackerLevel::VERBOSE) {} - -MemTracker::MemTracker(RuntimeProfile* profile, int64_t byte_limit, const string& label, - const std::shared_ptr& parent, bool log_usage_if_zero, MemTrackerLevel level) - : limit_(byte_limit), - soft_limit_(CalcSoftLimit(byte_limit)), - label_(label), - parent_(parent), - consumption_metric_(nullptr), - log_usage_if_zero_(log_usage_if_zero), - _level(level), - num_gcs_metric_(nullptr), - bytes_freed_by_last_gc_metric_(nullptr), - bytes_over_limit_metric_(nullptr), - limit_metric_(nullptr) { + : MemTracker(byte_limit, label, std::shared_ptr(), MemTrackerLevel::VERBOSE, + nullptr) {} + +MemTracker::MemTracker(int64_t byte_limit, const std::string& label, + const std::shared_ptr& parent, MemTrackerLevel level, + RuntimeProfile* profile) + : _limit(byte_limit), _label(label), _id(_label + std::to_string(GetCurrentTimeMicros()) + std::to_string(rand())), _parent(parent), _level(level) { if (profile == nullptr) { - consumption_ = std::make_shared(TUnit::BYTES); + _consumption = std::make_shared(TUnit::BYTES); } else { - consumption_ = profile->AddSharedHighWaterMarkCounter(COUNTER_NAME, TUnit::BYTES); + _consumption = profile->AddSharedHighWaterMarkCounter(COUNTER_NAME, TUnit::BYTES); } } -void MemTracker::Init() { - DCHECK_GE(limit_, -1); - DCHECK_LE(soft_limit_, limit_); - // populate all_trackers_ and limit_trackers_ +void MemTracker::init() { + DCHECK_GE(_limit, -1); MemTracker* tracker = this; - while (tracker != nullptr) { - all_trackers_.push_back(tracker); - if (tracker->has_limit()) limit_trackers_.push_back(tracker); - tracker = tracker->parent_.get(); + while (tracker != nullptr && tracker->_virtual == false) { + _all_trackers.push_back(tracker); + if (tracker->has_limit()) _limit_trackers.push_back(tracker); + tracker = tracker->_parent.get(); } - DCHECK_GT(all_trackers_.size(), 0); - DCHECK_EQ(all_trackers_[0], this); + DCHECK_GT(_all_trackers.size(), 0); + DCHECK_EQ(_all_trackers[0], this); } -void MemTracker::AddChildTracker(const 
std::shared_ptr& tracker) { - lock_guard l(child_trackers_lock_); - tracker->child_tracker_it_ = child_trackers_.insert(child_trackers_.end(), tracker); -} - -void MemTracker::EnableReservationReporting(const ReservationTrackerCounters& counters) { - delete reservation_counters_.swap(new ReservationTrackerCounters(counters)); -} - -int64_t MemTracker::GetLowestLimit(MemLimit mode) const { - if (limit_trackers_.empty()) return -1; - int64_t min_limit = numeric_limits::max(); - for (MemTracker* limit_tracker : limit_trackers_) { - DCHECK(limit_tracker->has_limit()); - min_limit = std::min(min_limit, limit_tracker->GetLimit(mode)); - } - return min_limit; -} - -int64_t MemTracker::SpareCapacity(MemLimit mode) const { - int64_t result = std::numeric_limits::max(); - for (const auto& tracker : limit_trackers_) { - int64_t mem_left = tracker->GetLimit(mode) - tracker->consumption(); - result = std::min(result, mem_left); - } - return result; -} - -void MemTracker::RefreshConsumptionFromMetric() { - DCHECK(consumption_metric_ != nullptr); - consumption_->set(consumption_metric_->value()); -} - -int64_t MemTracker::GetPoolMemReserved() { - // Pool trackers should have a pool_name_ and no limit. - DCHECK(!pool_name_.empty()); - DCHECK_EQ(limit_, -1) << LogUsage(UNLIMITED_DEPTH); - - // Use cache to avoid holding child_trackers_lock_ - list> children; - { - lock_guard l(child_trackers_lock_); - children = child_trackers_; - } - - int64_t mem_reserved = 0L; - for (const auto& child_weak : children) { - std::shared_ptr child = child_weak.lock(); - if (child) { - int64_t child_limit = child->limit(); - if (child_limit > 0) { - // Make sure we don't overflow if the query limits are set to ridiculous values. - mem_reserved += std::min(child_limit, MemInfo::physical_mem()); - } else { - DCHECK(child_limit == -1) - << child->LogUsage(UNLIMITED_DEPTH); - mem_reserved += child->consumption(); - } - } - } - return mem_reserved; -} - -std::shared_ptr PoolMemTrackerRegistry::GetRequestPoolMemTracker( - const string& pool_name, bool create_if_not_present) { - DCHECK(!pool_name.empty()); - lock_guard l(pool_to_mem_trackers_lock_); - PoolTrackersMap::iterator it = pool_to_mem_trackers_.find(pool_name); - if (it != pool_to_mem_trackers_.end()) { - MemTracker* tracker = it->second.get(); - DCHECK(pool_name == tracker->pool_name_); - return it->second; - } - if (!create_if_not_present) return nullptr; - // First time this pool_name registered, make a new object. - std::shared_ptr tracker = MemTracker::CreateTracker( - -1, strings::Substitute(REQUEST_POOL_MEM_TRACKER_LABEL_FORMAT, pool_name), - ExecEnv::GetInstance()->process_mem_tracker()); - tracker->pool_name_ = pool_name; - pool_to_mem_trackers_.emplace(pool_name, std::shared_ptr(tracker)); - return tracker; +void MemTracker::init_virtual() { + DCHECK_GE(_limit, -1); + _all_trackers.push_back(this); + if (this->has_limit()) _limit_trackers.push_back(this); + _virtual = true; } MemTracker::~MemTracker() { - delete reservation_counters_.load(); - - if (parent()) { - DCHECK(consumption() == 0) << "Memory tracker " << debug_string() - << " has unreleased consumption " << consumption(); - parent_->Release(consumption()); - - lock_guard l(parent_->child_trackers_lock_); - if (child_tracker_it_ != parent_->child_trackers_.end()) { - parent_->child_trackers_.erase(child_tracker_it_); - child_tracker_it_ = parent_->child_trackers_.end(); + // TCMalloc hook will be triggered during destructor memtracker, may cause crash. 
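    // (Sketch of the failure mode this guards against: ~MemTracker() itself frees
    // memory, e.g. the child-tracker list, and each such free re-enters the
    // new/delete hook, which may then consume/release on a tracker that is already
    // mid-destruction. Stopping the thread-local tracker when the "Process" tracker
    // is destroyed makes these final frees untracked instead of re-entrant.)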
+ if (_label == "Process") GLOBAL_STOP_THREAD_LOCAL_MEM_TRACKER(); + if (!_virtual && parent()) { + if (consumption() != 0) { + memory_leak_check(this); + // At present, it can only guarantee the accurate recording of the Instance tracker, + // lower layer has the problem of repeated release of different trackers, as explained above. + if (_level <= MemTrackerLevel::INSTANCE) { + // _parent->release(consumption()); + } } - } -} - -void MemTracker::ListTrackers(vector>* trackers) { - trackers->clear(); - deque> to_process; - to_process.push_front(GetRootTracker()); - while (!to_process.empty()) { - shared_ptr t = to_process.back(); - to_process.pop_back(); - trackers->push_back(t); - list> children; - { - lock_guard l(t->child_trackers_lock_); - children = t->child_trackers_; - } - for (const auto& child_weak : children) { - shared_ptr child = child_weak.lock(); - if (child && static_cast(child->_level) <= config::mem_tracker_level) { - to_process.emplace_back(std::move(child)); - } + // Do not call release on the parent tracker to avoid repeated releases. + // Ensure that all consume/release are triggered by TCMalloc new/delete hook. + lock_guard l(_parent->_child_trackers_lock); + if (_child_tracker_it != _parent->_child_trackers.end()) { + _parent->_child_trackers.erase(_child_tracker_it); + _child_tracker_it = _parent->_child_trackers.end(); } } + consume(_untracked_mem); } -//void MemTracker::RegisterMetrics(MetricGroup* metrics, const string& prefix) { -// num_gcs_metric_ = metrics->AddCounter(strings::Substitute("$0.num-gcs", prefix), 0); -// -// // TODO: Consider a total amount of bytes freed counter -// bytes_freed_by_last_gc_metric_ = metrics->AddGauge( -// strings::Substitute("$0.bytes-freed-by-last-gc", prefix), -1); -// -// bytes_over_limit_metric_ = metrics->AddGauge( -// strings::Substitute("$0.bytes-over-limit", prefix), -1); -// -// limit_metric_ = metrics->AddGauge(strings::Substitute("$0.limit", prefix), limit_); -//} - -void MemTracker::TransferTo(MemTracker* dst, int64_t bytes) { - DCHECK_EQ(all_trackers_.back(), dst->all_trackers_.back()) << "Must have same root"; +void MemTracker::transfer_to_relative(const std::shared_ptr& dst, int64_t bytes) { + DCHECK_EQ(_all_trackers.back(), dst->_all_trackers.back()) << "Must have same ancestor"; + DCHECK(!dst->has_limit()); // Find the common ancestor and update trackers between 'this'/'dst' and // the common ancestor. This logic handles all cases, including the // two trackers being the same or being ancestors of each other because // 'all_trackers_' includes the current tracker. 
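    // Worked example: if this has _all_trackers = {InstanceA, Query, Process} and
    // dst has {InstanceB, Query, Process}, the walk below stops one step before the
    // arrays diverge, so common_ancestor == Query; release()/consume() then use it
    // as end_tracker, updating only InstanceA and InstanceB while Query and Process
    // keep their totals unchanged.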
- int ancestor_idx = all_trackers_.size() - 1; - int dst_ancestor_idx = dst->all_trackers_.size() - 1; + int ancestor_idx = _all_trackers.size() - 1; + int dst_ancestor_idx = dst->_all_trackers.size() - 1; while (ancestor_idx > 0 && dst_ancestor_idx > 0 && - all_trackers_[ancestor_idx - 1] == dst->all_trackers_[dst_ancestor_idx - 1]) { + _all_trackers[ancestor_idx - 1] == dst->_all_trackers[dst_ancestor_idx - 1]) { + DCHECK(!dst->_all_trackers[dst_ancestor_idx - 1]->has_limit()); --ancestor_idx; --dst_ancestor_idx; } - MemTracker* common_ancestor = all_trackers_[ancestor_idx]; - ReleaseLocal(bytes, common_ancestor); - dst->ConsumeLocal(bytes, common_ancestor); + MemTracker* common_ancestor = _all_trackers[ancestor_idx]; + release(bytes, common_ancestor); + dst->consume(bytes, common_ancestor); } // Calling this on the query tracker results in output like: @@ -353,83 +209,48 @@ void MemTracker::TransferTo(MemTracker* dst, int64_t bytes) { // TrackerName: Limit=5.00 MB Reservation=5.00 MB OtherMemory=1.04 MB // Total=6.04 MB Peak=6.45 MB // -std::string MemTracker::LogUsage(int max_recursive_depth, const string& prefix, - int64_t* logged_consumption) { +std::string MemTracker::log_usage(int max_recursive_depth, int64_t* logged_consumption) { // Make sure the consumption is up to date. - if (consumption_metric_ != nullptr) RefreshConsumptionFromMetric(); int64_t curr_consumption = consumption(); - int64_t peak_consumption = consumption_->value(); + int64_t peak_consumption = _consumption->value(); if (logged_consumption != nullptr) *logged_consumption = curr_consumption; - if (!log_usage_if_zero_ && curr_consumption == 0) return ""; - - std::stringstream ss; - ss << prefix << label_ << ":"; - if (CheckLimitExceeded(MemLimit::HARD)) ss << " memory limit exceeded."; - if (limit_ > 0) ss << " Limit=" << PrettyPrinter::print(limit_, TUnit::BYTES); - - // TODO(zxy): ReservationTrackerCounters is not actually used in the current Doris. - // Printing here ReservationTrackerCounters may cause BE crash when high concurrency. - // The memory tracker in Doris will be redesigned in the future. - // ReservationTrackerCounters* reservation_counters = reservation_counters_.load(); - // if (reservation_counters != nullptr) { - // int64_t reservation = reservation_counters->peak_reservation->current_value(); - // ss << " Reservation=" << PrettyPrinter::print(reservation, TUnit::BYTES); - // if (reservation_counters->reservation_limit != nullptr) { - // int64_t limit = reservation_counters->reservation_limit->value(); - // ss << " ReservationLimit=" << PrettyPrinter::print(limit, TUnit::BYTES); - // } - // ss << " OtherMemory=" << PrettyPrinter::print(curr_consumption - reservation, TUnit::BYTES); - // } - ss << " Total=" << PrettyPrinter::print(curr_consumption, TUnit::BYTES); - // Peak consumption is not accurate if the metric is lazily updated (i.e. - // this is a non-root tracker that exists only for reporting purposes). - // Only report peak consumption if we actually call Consume()/Release() on - // this tracker or an descendent. 
- if (consumption_metric_ == nullptr || parent_ == nullptr) { - ss << " Peak=" << PrettyPrinter::print(peak_consumption, TUnit::BYTES); - } + if (_level > MemTrackerLevel::INSTANCE && curr_consumption == 0) return ""; + + std::string detail = + "MemTracker log_usage Label: {}, Limit: {}, Total: {}, Peak: {}, Exceeded: {}"; + detail = fmt::format(detail, _label, PrettyPrinter::print(_limit, TUnit::BYTES), + PrettyPrinter::print(curr_consumption, TUnit::BYTES), + PrettyPrinter::print(peak_consumption, TUnit::BYTES), + limit_exceeded() ? "true" : "false"); // This call does not need the children, so return early. - if (max_recursive_depth == 0) return ss.str(); + if (max_recursive_depth == 0) return detail; // Recurse and get information about the children - std::string new_prefix = strings::Substitute(" $0", prefix); int64_t child_consumption; std::string child_trackers_usage; - list> children; + std::list> children; { - lock_guard l(child_trackers_lock_); - children = child_trackers_; - } - child_trackers_usage = - LogUsage(max_recursive_depth - 1, new_prefix, children, &child_consumption); - if (!child_trackers_usage.empty()) ss << "\n" << child_trackers_usage; - - if (parent_ == nullptr) { - // Log the difference between the metric value and children as "untracked" memory so - // that the values always add up. This value is not always completely accurate because - // we did not necessarily get a consistent snapshot of the consumption values for all - // children at a single moment in time, but is good enough for our purposes. - int64_t untracked_bytes = curr_consumption - child_consumption; - ss << "\n" - << new_prefix - << "Untracked Memory: Total=" << PrettyPrinter::print(untracked_bytes, TUnit::BYTES); + lock_guard l(_child_trackers_lock); + children = _child_trackers; } - return ss.str(); + child_trackers_usage = log_usage(max_recursive_depth - 1, children, &child_consumption); + if (!child_trackers_usage.empty()) detail += "\n" + child_trackers_usage; + return detail; } -std::string MemTracker::LogUsage(int max_recursive_depth, const string& prefix, - const list>& trackers, - int64_t* logged_consumption) { +std::string MemTracker::log_usage(int max_recursive_depth, + const std::list>& trackers, + int64_t* logged_consumption) { *logged_consumption = 0; - std::vector usage_strings; + std::vector usage_strings; for (const auto& tracker_weak : trackers) { - shared_ptr tracker = tracker_weak.lock(); + std::shared_ptr tracker = tracker_weak.lock(); if (tracker) { int64_t tracker_consumption; std::string usage_string = - tracker->LogUsage(max_recursive_depth, prefix, &tracker_consumption); + tracker->log_usage(max_recursive_depth, &tracker_consumption); if (!usage_string.empty()) usage_strings.push_back(usage_string); *logged_consumption += tracker_consumption; } @@ -437,136 +258,62 @@ std::string MemTracker::LogUsage(int max_recursive_depth, const string& prefix, return join(usage_strings, "\n"); } -std::string MemTracker::LogTopNQueries(int limit) { - if (limit == 0) return ""; - priority_queue, std::vector>, - std::greater>> - min_pq; - GetTopNQueries(min_pq, limit); - std::vector usage_strings(min_pq.size()); - while (!min_pq.empty()) { - usage_strings.push_back(min_pq.top().second); - min_pq.pop(); - } - std::reverse(usage_strings.begin(), usage_strings.end()); - return join(usage_strings, "\n"); -} - -void MemTracker::GetTopNQueries( - priority_queue, std::vector>, - greater>>& min_pq, - int limit) { - list> children; - { - lock_guard l(child_trackers_lock_); - children = 
child_trackers_;
+Status MemTracker::mem_limit_exceeded(RuntimeState* state, const std::string& details,
+                                      int64_t failed_allocation_size, Status failed_alloc) {
+    MemTracker* process_tracker = MemTracker::get_raw_process_tracker();
+    std::string detail =
+            "Memory exceeded limit. fragment={}, details={}, on backend={}. Memory left in process "
+            "limit={}.";
+    detail = fmt::format(detail, state != nullptr ? print_id(state->fragment_instance_id()) : "",
+                         details, BackendOptions::get_localhost(),
+                         PrettyPrinter::print(process_tracker->spare_capacity(), TUnit::BYTES));
+    if (!failed_alloc) {
+        detail += " failed alloc=<{}>. current tracker={}.";
+        detail = fmt::format(detail, failed_alloc.to_string(), _label);
+    } else {
+        detail += " current tracker <label={}, used={}, limit={}, failed alloc size={}>.";
+        detail = fmt::format(detail, _label, _consumption->current_value(), _limit,
+                             PrettyPrinter::print(failed_allocation_size, TUnit::BYTES));
     }
-    for (const auto& child_weak : children) {
-        shared_ptr<MemTracker> child = child_weak.lock();
-        if (child) {
-            child->GetTopNQueries(min_pq, limit);
-        }
+    detail += " If this is a query, the limit can be changed via the session variable exec_mem_limit.";
+    Status status = Status::MemoryLimitExceeded(detail);
+    if (state != nullptr) state->log_error(detail);
+
+    // only print the tracker log_usage in be log.
+    if (process_tracker->spare_capacity() < failed_allocation_size) {
+        // Dumping the process MemTracker is expensive. Limiting the recursive depth to two
+        // levels limits the level of detail to a one-line summary for each query MemTracker.
+        detail += "\n" + process_tracker->log_usage(2);
     }
-}
-
-MemTracker* MemTracker::GetQueryMemTracker() {
-    MemTracker* tracker = this;
-    while (tracker != nullptr) {
-        tracker = tracker->parent_.get();
+    if (parent_task_mem_tracker() != nullptr) {
+        detail += "\n" + parent_task_mem_tracker()->log_usage();
     }
-    return tracker;
-}
+    LOG(WARNING) << detail;
 
-Status MemTracker::MemLimitExceeded(MemTracker* mtracker, RuntimeState* state,
-                                    const std::string& details, int64_t failed_allocation_size) {
-    DCHECK_GE(failed_allocation_size, 0);
-    std::stringstream ss;
-    if (!details.empty()) ss << details << std::endl;
-    if (failed_allocation_size != 0) {
-        if (mtracker != nullptr) ss << mtracker->label();
-        ss << " could not allocate " << PrettyPrinter::print(failed_allocation_size, TUnit::BYTES)
-           << " without exceeding limit." << std::endl;
-    }
-    ss << "Error occurred on backend " << BackendOptions::get_localhost();
-    if (state != nullptr) ss << " by fragment " << print_id(state->fragment_instance_id());
-    ss << std::endl;
-    ExecEnv* exec_env = ExecEnv::GetInstance();
-    MemTracker* process_tracker = exec_env->process_mem_tracker().get();
-    const int64_t process_capacity = process_tracker->SpareCapacity(MemLimit::HARD);
-    ss << "Memory left in process limit: " << PrettyPrinter::print(process_capacity, TUnit::BYTES)
-       << std::endl;
-    Status status = Status::MemoryLimitExceeded(ss.str());
-
-    // only print the query tracker in be log(if available).
- MemTracker* query_tracker = nullptr; - if (mtracker != nullptr) { - query_tracker = mtracker->GetQueryMemTracker(); - if (query_tracker != nullptr) { - if (query_tracker->has_limit()) { - const int64_t query_capacity = - query_tracker->limit() - query_tracker->consumption(); - ss << "Memory left in query limit: " - << PrettyPrinter::print(query_capacity, TUnit::BYTES) << std::endl; - } - ss << query_tracker->LogUsage(UNLIMITED_DEPTH); - } - } - - // Log the process level if the process tracker is close to the limit or - // if this tracker is not within a query's MemTracker hierarchy. - if (process_capacity < failed_allocation_size || query_tracker == nullptr) { - // IMPALA-5598: For performance reasons, limit the levels of recursion when - // dumping the process tracker to only two layers. - ss << process_tracker->LogUsage(PROCESS_MEMTRACKER_LIMITED_DEPTH); - } - if (state != nullptr) state->log_error(ss.str()); - LOG(WARNING) << ss.str(); return status; } -void MemTracker::AddGcFunction(GcFunction f) { - gc_functions_.push_back(f); -} - -bool MemTracker::LimitExceededSlow(MemLimit mode) { - if (mode == MemLimit::HARD && bytes_over_limit_metric_ != nullptr) { - bytes_over_limit_metric_->set_value(consumption() - limit_); - } - return GcMemory(GetLimit(mode)); -} - -bool MemTracker::GcMemory(int64_t max_consumption) { +bool MemTracker::gc_memory(int64_t max_consumption) { if (max_consumption < 0) return true; - lock_guard l(gc_lock_); - if (consumption_metric_ != nullptr) RefreshConsumptionFromMetric(); + lock_guard l(_gc_lock); int64_t pre_gc_consumption = consumption(); // Check if someone gc'd before us if (pre_gc_consumption < max_consumption) return false; - if (num_gcs_metric_ != nullptr) num_gcs_metric_->increment(1); int64_t curr_consumption = pre_gc_consumption; + const int64_t EXTRA_BYTES_TO_FREE = 4L * 1024L * 1024L * 1024L; // TODO(zxy) Consider as config // Try to free up some memory - for (int i = 0; i < gc_functions_.size(); ++i) { + for (int i = 0; i < _gc_functions.size(); ++i) { // Try to free up the amount we are over plus some extra so that we don't have to // immediately GC again. Don't free all the memory since that can be unnecessarily // expensive. 
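    // Worked example with the 4 GB EXTRA_BYTES_TO_FREE above: if max_consumption
    // is 10 GB and consumption has grown to 11 GB, the first GcFunction is asked
    // to free 11 - 10 + 4 = 5 GB, leaving about 4 GB of headroom on success before
    // the next overrun triggers GC again.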
- const int64_t EXTRA_BYTES_TO_FREE = 512L * 1024L * 1024L; int64_t bytes_to_free = curr_consumption - max_consumption + EXTRA_BYTES_TO_FREE; - gc_functions_[i](bytes_to_free); - if (consumption_metric_ != nullptr) RefreshConsumptionFromMetric(); + _gc_functions[i](bytes_to_free); curr_consumption = consumption(); if (max_consumption - curr_consumption <= EXTRA_BYTES_TO_FREE) break; } - if (bytes_freed_by_last_gc_metric_ != nullptr) { - bytes_freed_by_last_gc_metric_->set_value(pre_gc_consumption - curr_consumption); - } return curr_consumption > max_consumption; } -std::shared_ptr MemTracker::GetRootTracker() { - GoogleOnceInit(&root_tracker_once, &MemTracker::CreateRootTracker); - return root_tracker; -} - } // namespace doris diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h index 1622a70e71adba..22309518091192 100644 --- a/be/src/runtime/mem_tracker.h +++ b/be/src/runtime/mem_tracker.h @@ -18,49 +18,29 @@ #pragma once #include -#include #include #include -#include -#include -#include -#include -#include -#include +#include "common/config.h" #include "common/status.h" -#include "gen_cpp/Types_types.h" // for TUniqueId #include "util/mem_info.h" -#include "util/metrics.h" #include "util/runtime_profile.h" #include "util/spinlock.h" namespace doris { -/// Mode argument passed to various MemTracker methods to indicate whether a soft or hard -/// limit should be used. -enum class MemLimit { HARD, SOFT }; +// The Level use to decide whether to show it in web page, +// each MemTracker have a Level less than or equal to parent, only be set explicit, +// TASK contains query, import, compaction, etc. +enum class MemTrackerLevel { OVERVIEW = 0, TASK, INSTANCE, VERBOSE }; -/// The Level use to decide whether to show it in web page -/// each MemTracker have a Level equals to parent, only be set explicit -enum class MemTrackerLevel { OVERVIEW = 0, TASK, VERBOSE }; - -class ObjectPool; class MemTracker; -struct ReservationTrackerCounters; class RuntimeState; -class TQueryOptions; /// A MemTracker tracks memory consumption; it contains an optional limit /// and can be arranged into a tree structure such that the consumption tracked /// by a MemTracker is also tracked by its ancestors. /// -/// A MemTracker has a hard and a soft limit derived from the limit. If the hard limit -/// is exceeded, all memory allocations and queries should fail until we are under the -/// limit again. The soft limit can be exceeded without causing query failures, but -/// consumers of memory that can tolerate running without more memory should not allocate -/// memory in excess of the soft limit. -/// /// We use a five-level hierarchy of mem trackers: process, pool, query, fragment /// instance. Specific parts of the fragment (exec nodes, sinks, etc) will add a /// fifth level when they are initialized. This function also initializes a user @@ -77,7 +57,7 @@ class TQueryOptions; /// Release(). /// /// GcFunctions can be attached to a MemTracker in order to free up memory if the limit is -/// reached. If LimitExceeded() is called and the limit is exceeded, it will first call +/// reached. If limit_exceeded() is called and the limit is exceeded, it will first call /// the GcFunctions to try to free memory and recheck the limit. For example, the process /// tracker has a GcFunction that releases any unused memory still held by tcmalloc, so /// this will be called before the process limit is reported as exceeded. 
GcFunctions are
@@ -86,130 +66,91 @@ class TQueryOptions;
 /// call back into MemTrackers, except to release memory.
 //
 /// This class is thread-safe.
-class MemTracker : public std::enable_shared_from_this<MemTracker> {
+class MemTracker {
 public:
-    // Creates and adds the tracker to the tree so that it can be retrieved with
-    // FindTracker/FindOrCreateTracker.
-    static std::shared_ptr<MemTracker> CreateTracker(
+    // Creates and adds the tracker to the tree
+    static std::shared_ptr<MemTracker> create_tracker(
             int64_t byte_limit = -1, const std::string& label = std::string(),
-            std::shared_ptr<MemTracker> parent = std::shared_ptr<MemTracker>(),
-            bool log_usage_if_zero = true, bool reset_label_name = true,
-            MemTrackerLevel level = MemTrackerLevel::VERBOSE);
+            const std::shared_ptr<MemTracker>& parent = std::shared_ptr<MemTracker>(),
+            MemTrackerLevel level = MemTrackerLevel::VERBOSE, RuntimeProfile* profile = nullptr);
 
-    static std::shared_ptr<MemTracker> CreateTracker(
-            RuntimeProfile* profile, int64_t byte_limit, const std::string& label = std::string(),
+    // Consume/release will not sync to parent. Usually used to manually record specified memory.
+    // It is independent of the TCMalloc hook recording in the thread-local tracker, so the same
+    // block of memory is recorded independently in these two trackers.
+    static std::shared_ptr<MemTracker> create_virtual_tracker(
+            int64_t byte_limit = -1, const std::string& label = std::string(),
             const std::shared_ptr<MemTracker>& parent = std::shared_ptr<MemTracker>(),
-            bool reset_label_name = true, MemTrackerLevel level = MemTrackerLevel::VERBOSE);
+            MemTrackerLevel level = MemTrackerLevel::VERBOSE);
 
     // this is used for creating an orphan mem tracker, or for unit test.
-    // If a mem tracker has parent, it should be created by `CreateTracker()`
+    // If a mem tracker has parent, it should be created by `create_tracker()`
    MemTracker(int64_t byte_limit = -1, const std::string& label = std::string());
 
    ~MemTracker();
 
     // Returns a list of all the valid trackers.
-    static void ListTrackers(std::vector<std::shared_ptr<MemTracker>>* trackers);
-
-    /// Include counters from a ReservationTracker in logs and other diagnostics.
-    /// The counters should be owned by the fragment's RuntimeProfile.
-    void EnableReservationReporting(const ReservationTrackerCounters& counters);
+    static void list_process_trackers(std::vector<std::shared_ptr<MemTracker>>* trackers);
 
-    // Gets a shared_ptr to the "root" tracker, creating it if necessary.
-    static std::shared_ptr<MemTracker> GetRootTracker();
+    // Gets a shared_ptr to the "process" tracker, creating it if necessary.
+    static std::shared_ptr<MemTracker> get_process_tracker();
+    static MemTracker* get_raw_process_tracker();
 
-    // delete static CreateQueryMemTracker(), cuz it cannot use shared tracker
-
-    /// Increases consumption of this tracker and its ancestors by 'bytes'.
-    void Consume(int64_t bytes) {
-        // DCHECK_GE(bytes, 0);
-        if (bytes < 0) {
-            Release(-bytes);
-            return;
-        }
-        if (bytes == 0) {
-            return;
+    inline Status check_sys_mem_info(int64_t bytes) {
+        if (MemInfo::initialized() && MemInfo::current_mem() + bytes >= MemInfo::mem_limit()) {
+            return Status::MemoryLimitExceeded(fmt::format(
+                    "{}: TryConsume failed, bytes={} process whole consumption={} mem limit={}",
+                    _label, bytes, MemInfo::current_mem(), MemInfo::mem_limit()));
         }
+        return Status::OK();
+    }
 
-        if (UNLIKELY(consumption_metric_ != nullptr)) {
-            RefreshConsumptionFromMetric();
-            return; // TODO(yingchun): why return not update tracker?
+    // Increases consumption of this tracker and its ancestors by 'bytes',
+    // up to (but not including) end_tracker.
+    // This is useful if we want to move tracking between trackers that share a common (i.e.
end_tracker) + // ancestor. This happens when we want to update tracking on a particular mem tracker but the consumption + // against the limit recorded in one of its ancestors already happened. + void consume(int64_t bytes, MemTracker* end_tracker = nullptr) { + if (bytes <= 0) { + release(-bytes, end_tracker); + return; } - for (auto& tracker : all_trackers_) { - tracker->consumption_->add(bytes); - if (LIKELY(tracker->consumption_metric_ == nullptr)) { - DCHECK_GE(tracker->consumption_->current_value(), 0); - } + for (auto& tracker : _all_trackers) { + if (tracker == end_tracker) return; + tracker->_consumption->add(bytes); + memory_leak_check(tracker); } } - /// Increases the consumption of this tracker and the ancestors up to (but - /// not including) end_tracker. This is useful if we want to move tracking between - /// trackers that share a common (i.e. end_tracker) ancestor. This happens when we want - /// to update tracking on a particular mem tracker but the consumption against - /// the limit recorded in one of its ancestors already happened. - void ConsumeLocal(int64_t bytes, MemTracker* end_tracker) { - DCHECK_GE(bytes, 0); - if (UNLIKELY(bytes < 0)) return; // needed in RELEASE, hits DCHECK in DEBUG - ChangeConsumption(bytes, end_tracker); - } - - /// Same as above, but it decreases the consumption. - void ReleaseLocal(int64_t bytes, MemTracker* end_tracker) { - DCHECK_GE(bytes, 0); - if (UNLIKELY(bytes < 0)) return; // needed in RELEASE, hits DCHECK in DEBUG - ChangeConsumption(-bytes, end_tracker); - } - - /// Increases consumption of this tracker and its ancestors by 'bytes' only if - /// they can all consume 'bytes' without exceeding limit (hard or soft) specified - /// by 'mode'. If any limit would be exceed, no MemTrackers are updated. If the - /// caller can tolerate an allocation failing, it should set mode=SOFT so that - /// other callers that may not tolerate allocation failures have a better chance - /// of success. Returns true if the consumption was successfully updated. + // Increases consumption of this tracker and its ancestors by 'bytes' only if + // they can all consume 'bytes' without exceeding limit. If limit would be exceed, + // no MemTrackers are updated. Returns true if the consumption was successfully updated. WARN_UNUSED_RESULT - Status TryConsume(int64_t bytes, MemLimit mode = MemLimit::HARD) { - // DCHECK_GE(bytes, 0); + Status try_consume(int64_t bytes) { if (bytes <= 0) { - Release(-bytes); + release(-bytes); return Status::OK(); } - if (MemInfo::current_mem() + bytes >= MemInfo::mem_limit()) { - return Status::MemoryLimitExceeded(fmt::format( - "{}: TryConsume failed, bytes={} process whole consumption={} mem limit={}", - label_, bytes, MemInfo::current_mem(), MemInfo::mem_limit())); - } - // if (UNLIKELY(bytes == 0)) return true; - // if (UNLIKELY(bytes < 0)) return false; // needed in RELEASE, hits DCHECK in DEBUG - if (UNLIKELY(consumption_metric_ != nullptr)) RefreshConsumptionFromMetric(); + RETURN_IF_ERROR(check_sys_mem_info(bytes)); int i; // Walk the tracker tree top-down. - for (i = all_trackers_.size() - 1; i >= 0; --i) { - MemTracker* tracker = all_trackers_[i]; - const int64_t limit = tracker->GetLimit(mode); - if (limit < 0) { - tracker->consumption_->add(bytes); // No limit at this tracker. + for (i = _all_trackers.size() - 1; i >= 0; --i) { + MemTracker* tracker = _all_trackers[i]; + if (tracker->limit() < 0) { + tracker->_consumption->add(bytes); // No limit at this tracker. 
} else { // If TryConsume fails, we can try to GC, but we may need to try several times if // there are concurrent consumers because we don't take a lock before trying to - // update consumption_. + // update _consumption. while (true) { - if (LIKELY(tracker->consumption_->try_add(bytes, limit))) break; - - if (UNLIKELY(tracker->GcMemory(limit - bytes))) { - DCHECK_GE(i, 0); + if (LIKELY(tracker->_consumption->try_add(bytes, tracker->limit()))) break; + Status st = tracker->try_gc_memory(bytes); + if (!st) { // Failed for this mem tracker. Roll back the ones that succeeded. - for (int j = all_trackers_.size() - 1; j > i; --j) { - all_trackers_[j]->consumption_->add(-bytes); + for (int j = _all_trackers.size() - 1; j > i; --j) { + _all_trackers[j]->_consumption->add(-bytes); } - return Status::MemoryLimitExceeded(fmt::format( - "{}: TryConsume failed, bytes={} consumption={} imit={} " - "attempting to GC", - tracker->label(), bytes, tracker->consumption_->current_value(), - limit)); + return st; } - VLOG_NOTICE << "GC succeeded, TryConsume bytes=" << bytes - << " consumption=" << tracker->consumption_->current_value() - << " limit=" << limit; } } } @@ -218,69 +159,116 @@ class MemTracker : public std::enable_shared_from_this { return Status::OK(); } - /// Decreases consumption of this tracker and its ancestors by 'bytes'. - void Release(int64_t bytes) { - // DCHECK_GE(bytes, 0); - if (bytes < 0) { - Consume(-bytes); - return; + int64_t add_untracked_mem(int64_t bytes) { + _untracked_mem += bytes; + if (std::abs(_untracked_mem) >= config::mem_tracker_consume_min_size_bytes) { // || + // _untracked_mem <= -config::mem_tracker_consume_min_size_bytes) { + // std::lock_guard l(_untracked_mem_lock); + // int64_t consume_bytes = _untracked_mem; + // _untracked_mem -= consume_bytes; + // return consume_bytes; + + return _untracked_mem.exchange(0); + + // do { + // consume_bytes = _untracked_mem; + // } while (!_untracked_mem.compare_exchange_weak(consume_bytes, 0)); + // return consume_bytes; } + return 0; + } - if (bytes == 0) { - return; + void release_cache(int64_t bytes) { + int64_t consume_bytes = add_untracked_mem(bytes); + if (consume_bytes != 0) { + release(consume_bytes); } + } - // if (UNLIKELY(bytes <= 0)) return; // < 0 needed in RELEASE, hits DCHECK in DEBUG + void consume_cache(int64_t bytes) { + int64_t consume_bytes = add_untracked_mem(bytes); + if (consume_bytes != 0) { + consume(consume_bytes); + } + // _untracked_mem += bytes; + // if (std::abs(_untracked_mem) >= config::mem_tracker_consume_min_size_bytes) { + // consume(_untracked_mem.exchange(0)); + // } + // consume(_untracked_mem.exchange(0)); + } - if (UNLIKELY(consumption_metric_ != nullptr)) { - RefreshConsumptionFromMetric(); - return; + WARN_UNUSED_RESULT + Status try_consume_cache(int64_t bytes) { + if (bytes <= 0) { + release_cache(-bytes); + return Status::OK(); } - for (auto& tracker : all_trackers_) { - tracker->consumption_->add(-bytes); - /// If a UDF calls FunctionContext::TrackAllocation() but allocates less than the - /// reported amount, the subsequent call to FunctionContext::Free() may cause the - /// process mem tracker to go negative until it is synced back to the tcmalloc - /// metric. Don't blow up in this case. (Note that this doesn't affect non-process - /// trackers since we can enforce that the reported memory usage is internally - /// consistent.) 
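    // Worked example of the _untracked_mem batching above: with the default
    // mem_tracker_consume_min_size_bytes of 2 MB, a stream of 64 KB consume_cache()
    // calls touches the shared tracker hierarchy only on every 32nd call, when the
    // accumulated 2 MB is flushed via _untracked_mem.exchange(0); the other calls
    // update a single atomic on this tracker.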
- if (LIKELY(tracker->consumption_metric_ == nullptr)) { - DCHECK_GE(tracker->consumption_->current_value(), 0) - << std::endl - << tracker->LogUsage(UNLIMITED_DEPTH); + int64_t consume_bytes = add_untracked_mem(bytes); + if (consume_bytes != 0) { + Status st = try_consume(consume_bytes); + if (!st) { + _untracked_mem += consume_bytes; + return st; } } + return Status::OK(); } - /// Transfer 'bytes' of consumption from this tracker to 'dst', updating - /// all ancestors up to the first shared ancestor. Must not be used if - /// 'dst' has a limit, or an ancestor with a limit, that is not a common - /// ancestor with the tracker, because this does not check memory limits. - void TransferTo(MemTracker* dst, int64_t bytes); - - /// Returns true if a valid limit of this tracker or one of its ancestors is - /// exceeded. - bool AnyLimitExceeded(MemLimit mode) { - for (const auto& tracker : limit_trackers_) { - if (tracker->LimitExceeded(mode)) { - return true; + // Decreases consumption of this tracker and its ancestors by 'bytes'. + // up to (but not including) end_tracker. + void release(int64_t bytes, MemTracker* end_tracker = nullptr) { + if (bytes < 0) { + consume(-bytes, end_tracker); + return; + } + if (bytes == 0) { + return; + } + for (auto& tracker : _all_trackers) { + if (tracker == end_tracker) return; + tracker->_consumption->add(-bytes); + memory_leak_check(tracker); + } + } + + static Status batch_consume(int64_t bytes, + const std::vector>& trackers) { + for (auto& tracker : trackers) { + Status st = tracker->try_consume(bytes); + if (!st) { + return st; } } - return false; + return Status::OK(); + } + + // Transfer 'bytes' of consumption from this tracker to 'dst'. + // updating all ancestors up to the first shared ancestor. Must not be used if + // 'dst' has a limit, or an ancestor with a limit, that is not a common + // ancestor with the tracker, because this does not check memory limits. + void transfer_to_relative(const std::shared_ptr& dst, int64_t bytes); + + WARN_UNUSED_RESULT + Status try_transfer_to(const std::shared_ptr& dst, int64_t bytes) { + // Must release first, then consume + consume_cache(-bytes); + Status st = dst->try_consume_cache(bytes); + if (!st) { + consume_cache(bytes); + return st; + } + return Status::OK(); } - /// If this tracker has a limit, checks the limit and attempts to free up some memory if - /// the hard limit is exceeded by calling any added GC functions. Returns true if the - /// limit is exceeded after calling the GC functions. Returns false if there is no limit - /// or consumption is under the limit. - bool LimitExceeded(MemLimit mode) { - if (UNLIKELY(CheckLimitExceeded(mode))) return LimitExceededSlow(mode); - return false; + // Forced transfer, 'dst' may limit exceed, and more ancestor trackers will be updated. + void transfer_to(const std::shared_ptr& dst, int64_t bytes) { + consume_cache(-bytes); + dst->consume_cache(bytes); } - // Return limit exceeded tracker or null - MemTracker* find_limit_exceeded_tracker() { - for (const auto& tracker : limit_trackers_) { + // Returns true if a valid limit of this tracker or one of its ancestors is exceeded. + MemTracker* limit_exceeded_tracker() const { + for (const auto& tracker : _limit_trackers) { if (tracker->limit_exceeded()) { return tracker; } @@ -288,66 +276,68 @@ class MemTracker : public std::enable_shared_from_this { return nullptr; } - /// Returns the maximum consumption that can be made without exceeding the limit on - /// this tracker or any of its parents. 
Returns int64_t::max() if there are no - /// limits and a negative value if any limit is already exceeded. - int64_t SpareCapacity(MemLimit mode) const; + bool any_limit_exceeded() const { return limit_exceeded_tracker() != nullptr; } - /// Refresh the memory consumption value from the consumption metric. Only valid to - /// call if this tracker has a consumption metric. - void RefreshConsumptionFromMetric(); - - // TODO(yingchun): following functions are old style which have no MemLimit parameter - bool limit_exceeded() const { return limit_ >= 0 && limit_ < consumption(); } - - int64_t limit() const { return limit_; } - bool has_limit() const { return limit_ >= 0; } + // Returns the maximum consumption that can be made without exceeding the limit on + // this tracker or any of its parents. Returns int64_t::max() if there are no + // limits and a negative value if any limit is already exceeded. + int64_t spare_capacity() const { + int64_t result = std::numeric_limits<int64_t>::max(); + for (const auto& tracker : _limit_trackers) { + int64_t mem_left = tracker->limit() - tracker->consumption(); + result = std::min(result, mem_left); + } + return result; + } - int64_t soft_limit() const { return soft_limit_; } - int64_t GetLimit(MemLimit mode) const { - if (mode == MemLimit::SOFT) return soft_limit(); - DCHECK_ENUM_EQ(mode, MemLimit::HARD); - return limit(); + // Returns the lowest limit for this tracker and its ancestors. Returns -1 if there is no limit. + int64_t get_lowest_limit() const { + if (_limit_trackers.empty()) return -1; + int64_t min_limit = std::numeric_limits<int64_t>::max(); + for (const auto& tracker : _limit_trackers) { + DCHECK(tracker->has_limit()); + min_limit = std::min(min_limit, tracker->limit()); + } + return min_limit; } - const std::string& label() const { return label_; } - /// Returns the lowest limit for this tracker and its ancestors. Returns - /// -1 if there is no limit. - int64_t GetLowestLimit(MemLimit mode) const; + bool limit_exceeded() const { return _limit >= 0 && _limit < consumption(); } + int64_t limit() const { return _limit; } + void set_limit(int64_t limit) { _limit = limit; } + bool has_limit() const { return _limit >= 0; } - /// Returns the memory 'reserved' by this resource pool mem tracker, which is the sum - /// of the memory reserved by the queries in it (i.e. its child trackers). The mem - /// reserved for a query that is currently executing is its limit_, if set (which - /// should be the common case with admission control). Otherwise, if the query has - /// no limit or the query is finished executing, the current consumption is used. - int64_t GetPoolMemReserved(); + Status check_limit(int64_t bytes) { + if (bytes <= 0) return Status::OK(); + RETURN_IF_ERROR(check_sys_mem_info(bytes)); + int i; + // Walk the tracker tree top-down. + for (i = _all_trackers.size() - 1; i >= 0; --i) { + MemTracker* tracker = _all_trackers[i]; + if (tracker->limit() > 0) { + while (true) { + if (LIKELY(tracker->_consumption->current_value() + bytes < tracker->limit())) + break; + RETURN_IF_ERROR(tracker->try_gc_memory(bytes)); + } + } + } + return Status::OK(); + } - /// Returns the memory consumed in bytes. - int64_t consumption() const { return consumption_->current_value(); } + const std::string& label() const { return _label; } - /// Note that if consumption_ is based on consumption_metric_, this will be the max value - /// we've recorded in consumption(), not necessarily the highest value - /// consumption_metric_ has ever reached.
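Since spare_capacity() above is just a running minimum of (limit - consumption) over the limited ancestors, its behaviour is easy to check with a toy model. ToyTracker below is an invented stand-in for MemTracker (whose constructor is private), assuming limit == -1 means unlimited as in the real class:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

// Toy model of spare_capacity(): the headroom of a tracker chain is the
// minimum of (limit - consumption) over every ancestor that has a limit.
struct ToyTracker {
    int64_t limit;        // -1 means no limit
    int64_t consumption;
};

int64_t spare_capacity(const std::vector<ToyTracker>& chain) {
    int64_t result = std::numeric_limits<int64_t>::max();
    for (const auto& t : chain) {
        if (t.limit < 0) continue; // unlimited tracker, does not constrain
        result = std::min(result, t.limit - t.consumption);
    }
    return result;
}

int main() {
    // process (limit 100) -> query (limit 40) -> instance (unlimited)
    std::vector<ToyTracker> chain = {{100, 70}, {40, 25}, {-1, 10}};
    std::cout << spare_capacity(chain) << std::endl; // 15: the query tracker binds first
    return 0;
}
```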
- int64_t peak_consumption() const { return consumption_->value(); } + // Returns the memory consumed in bytes. + int64_t consumption() const { return _consumption->current_value(); } + int64_t peak_consumption() const { return _consumption->value(); } - std::shared_ptr<MemTracker> parent() const { return parent_; } + std::shared_ptr<MemTracker> parent() const { return _parent; } - /// Signature for function that can be called to free some memory after limit is - /// reached. The function should try to free at least 'bytes_to_free' bytes of - /// memory. See the class header for further details on the expected behaviour of - /// these functions. typedef std::function<void(int64_t bytes_to_free)> GcFunction; - /// Add a function 'f' to be called if the limit is reached, if none of the other /// previously-added GC functions were successful at freeing up enough memory. /// 'f' does not need to be thread-safe as long as it is added to only one MemTracker. /// Note that 'f' must be valid for the lifetime of this MemTracker. - void AddGcFunction(GcFunction f); - - /// Register this MemTracker's metrics. Each key will be of the form - /// "<prefix>.<metric name>". - // TODO(yingchun): remove comments - //void RegisterMetrics(MetricGroup* metrics, const std::string& prefix); + void add_gc_function(GcFunction f) { _gc_functions.push_back(f); } /// Logs the usage of this tracker and optionally its children (recursively). /// If 'logged_consumption' is non-nullptr, sets the consumption value logged. @@ -355,251 +345,157 @@ /// to include in the dump. If it is zero, then no children are dumped. /// Limiting the recursive depth reduces the cost of dumping, particularly /// for the process MemTracker. - /// TODO: once all memory is accounted in ReservationTracker hierarchy, move - /// reporting there. - std::string LogUsage(int max_recursive_depth, const std::string& prefix = "", - int64_t* logged_consumption = nullptr); - /// Dumping the process MemTracker is expensive. Limiting the recursive depth - /// to two levels limits the level of detail to a one-line summary for each query - /// MemTracker, avoiding all MemTrackers below that level. This provides a summary - /// of process usage with substantially lower cost than the full dump. - static const int PROCESS_MEMTRACKER_LIMITED_DEPTH = 2; - /// Unlimited dumping is useful for query memtrackers or error conditions that - /// are not performance sensitive - static const int UNLIMITED_DEPTH = INT_MAX; - - /// Logs the usage of 'limit' number of queries based on maximum total memory - /// consumption. - std::string LogTopNQueries(int limit); + std::string log_usage(int max_recursive_depth = INT_MAX, int64_t* logged_consumption = nullptr); /// Log the memory usage when memory limit is exceeded and return a status object with /// details of the allocation which caused the limit to be exceeded. /// If 'failed_allocation_size' is greater than zero, logs the allocation size. If /// 'failed_allocation_size' is zero, nothing about the allocation size is logged. /// If 'state' is non-nullptr, logs the error to 'state'. - Status MemLimitExceeded(RuntimeState* state, const std::string& details, - int64_t failed_allocation = 0) WARN_UNUSED_RESULT { - return MemLimitExceeded(this, state, details, failed_allocation); - } - - /// Makes MemLimitExceeded callable for nullptr MemTrackers.
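The contract that add_gc_function() above participates in can be sketched standalone: registered callbacks run in order until consumption drops to the target, and the caller learns whether the limit is still exceeded. GcRunner and its fields are illustrative stand-ins, not this patch's API:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Sketch of the GC-callback contract: when consumption exceeds the target,
// run the registered functions in order until enough memory is freed.
using GcFunction = std::function<void(int64_t bytes_to_free)>;

struct GcRunner {
    int64_t consumption = 90;
    std::vector<GcFunction> gc_functions;

    // Returns true if consumption still exceeds max_consumption after GC,
    // mirroring the semantics described for gc_memory().
    bool gc(int64_t max_consumption) {
        for (auto& f : gc_functions) {
            if (consumption <= max_consumption) break;
            f(consumption - max_consumption);
        }
        return consumption > max_consumption;
    }
};

int main() {
    GcRunner r;
    // e.g. a cache registering itself as reclaimable memory
    r.gc_functions.push_back([&](int64_t bytes_to_free) { r.consumption -= 30; });
    bool still_exceeded = r.gc(80); // frees 30, consumption drops to 60 <= 80
    std::cout << std::boolalpha << still_exceeded << std::endl; // false
    return 0;
}
```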
- static Status MemLimitExceeded(MemTracker* mtracker, RuntimeState* state, - const std::string& details, - int64_t failed_allocation = 0) WARN_UNUSED_RESULT; - - static void update_limits(int64_t bytes, - const std::vector<std::shared_ptr<MemTracker>>& trackers) { - for (auto& tracker : trackers) { - tracker->Consume(bytes); + Status mem_limit_exceeded(RuntimeState* state, const std::string& details = std::string(), + int64_t failed_allocation = -1, + Status failed_alloc = Status::OK()) WARN_UNUSED_RESULT; + + // If this tracker or one of its ancestors is a Task MemTracker, return it. Otherwise return nullptr. + MemTracker* parent_task_mem_tracker() { + MemTracker* tracker = this; + while (tracker != nullptr && tracker->_level != MemTrackerLevel::TASK) { + tracker = tracker->_parent.get(); } + return tracker; } - static bool limit_exceeded(const std::vector<std::shared_ptr<MemTracker>>& trackers) { - for (const auto& tracker : trackers) { - if (tracker->limit_exceeded()) { - // TODO: remove logging - LOG(WARNING) << "exceeded limit: limit=" << tracker->limit() - << " consumption=" << tracker->consumption(); - return true; - } + bool has_virtual_ancestor() { + MemTracker* tracker = this; + while (tracker != nullptr && tracker->_virtual == false) { + tracker = tracker->_parent.get(); } + return tracker != nullptr; + } - return false; + std::string id() { + return _id; } std::string debug_string() { std::stringstream msg; - msg << "limit: " << limit_ << "; " - << "consumption: " << consumption_->current_value() << "; " - << "label: " << label_ << "; " - << "all tracker size: " << all_trackers_.size() << "; " - << "limit trackers size: " << limit_trackers_.size() << "; " - << "parent is null: " << ((parent_ == nullptr) ? "true" : "false") << "; "; + msg << "limit: " << _limit << "; " + << "consumption: " << _consumption->current_value() << "; " + << "label: " << _label << "; " + << "all tracker size: " << _all_trackers.size() << "; " + << "limit trackers size: " << _limit_trackers.size() << "; " + << "parent is null: " << ((_parent == nullptr) ? "true" : "false") << "; "; return msg.str(); } - bool is_consumption_metric_null() const { return consumption_metric_ == nullptr; } - static const std::string COUNTER_NAME; private: /// 'byte_limit' < 0 means no limit - /// 'label' is the label used in the usage string (LogUsage()) - /// If 'log_usage_if_zero' is false, this tracker (and its children) will not be - /// included in LogUsage() output if consumption is 0. - MemTracker(RuntimeProfile* profile, int64_t byte_limit, const std::string& label, - const std::shared_ptr<MemTracker>& parent, bool log_usage_if_zero, MemTrackerLevel); + /// 'label' is the label used in the usage string (log_usage()) + MemTracker(int64_t byte_limit, const std::string& label, + const std::shared_ptr<MemTracker>& parent, MemTrackerLevel, RuntimeProfile* profile); private: - friend class PoolMemTrackerRegistry; - - // TODO(HW): remove later - /// Closes this MemTracker. After closing it is invalid to consume memory on this - /// tracker and the tracker's consumption counter (which may be owned by a - /// RuntimeProfile, not this MemTracker) can be safely destroyed. MemTrackers without - /// consumption metrics in the context of a daemon must always be closed. - /// Idempotent: calling multiple times has no effect. - void Close(); - - /// Returns true if the current memory tracker's limit is exceeded.
- bool CheckLimitExceeded(MemLimit mode) const { - int64_t limit = GetLimit(mode); - return limit >= 0 && limit < consumption(); + // If consumption is higher than max_consumption, attempts to free memory by calling + // any added GC functions. Returns true if max_consumption is still exceeded. Takes gc_lock. + bool gc_memory(int64_t max_consumption); + + inline Status try_gc_memory(int64_t bytes) { + if (UNLIKELY(gc_memory(_limit - bytes))) { + return Status::MemoryLimitExceeded( + fmt::format("label={} TryConsume failed size={}, used={}, limit={}", label(), + bytes, _consumption->current_value(), _limit)); + } + VLOG_NOTICE << "GC succeeded, TryConsume bytes=" << bytes + << " consumption=" << _consumption->current_value() << " limit=" << _limit; + return Status::OK(); } - /// Slow path for LimitExceeded(). - bool LimitExceededSlow(MemLimit mode); - - /// If consumption is higher than max_consumption, attempts to free memory by calling - /// any added GC functions. Returns true if max_consumption is still exceeded. Takes - /// gc_lock. Updates metrics if initialized. - bool GcMemory(int64_t max_consumption); + // Walks the MemTracker hierarchy and populates _all_trackers and + // _limit_trackers + void init(); + void init_virtual(); - /// Walks the MemTracker hierarchy and populates all_trackers_ and - /// limit_trackers_ - void Init(); - - /// Adds tracker to child_trackers_ - void AddChildTracker(const std::shared_ptr<MemTracker>& tracker); + // Adds tracker to _child_trackers + void add_child_tracker(const std::shared_ptr<MemTracker>& tracker) { + std::lock_guard l(_child_trackers_lock); + tracker->_child_tracker_it = _child_trackers.insert(_child_trackers.end(), tracker); + } /// Log consumption of all the trackers provided. Returns the sum of consumption in /// 'logged_consumption'. 'max_recursive_depth' specifies the maximum number of levels /// of children to include in the dump. If it is zero, then no children are dumped. - static std::string LogUsage(int max_recursive_depth, const std::string& prefix, - const std::list<std::weak_ptr<MemTracker>>& trackers, - int64_t* logged_consumption); - - /// Helper function for LogTopNQueries that iterates through the MemTracker hierarchy - /// and populates 'min_pq' with 'limit' number of elements (that contain state related - /// to query MemTrackers) based on maximum total memory consumption. - void GetTopNQueries(std::priority_queue<std::pair<int64_t, std::string>, - std::vector<std::pair<int64_t, std::string>>, - std::greater<std::pair<int64_t, std::string>>>& min_pq, - int limit); - - /// If an ancestor of this tracker is a query MemTracker, return that tracker. - /// Otherwise return nullptr. - MemTracker* GetQueryMemTracker(); - - /// Increases/Decreases the consumption of this tracker and the ancestors up to (but - /// not including) end_tracker. - void ChangeConsumption(int64_t bytes, MemTracker* end_tracker) { - DCHECK(consumption_metric_ == nullptr) << "Should not be called on root."; - for (MemTracker* tracker : all_trackers_) { - if (tracker == end_tracker) return; - DCHECK(!tracker->has_limit()) << tracker->label() << " have limit:" << tracker->limit(); - tracker->consumption_->add(bytes); + static std::string log_usage(int max_recursive_depth, + const std::list<std::weak_ptr<MemTracker>>& trackers, + int64_t* logged_consumption); + + // Usually, a negative value means that the statistics are not accurate: + // 1. The released memory is not consumed. + // 2. The same block of memory, tracker A calls consume, and tracker B calls release. + // 3. Repeated releases of MemTracker.
If consume is called on a child MemTracker + // after release has already been called on its parent, + // the child's ~MemTracker will release the same bytes again. + // + // But the TCMalloc hook caches a batch of untracked bytes when it consumes/releases a + // MemTracker, which may cause tracker->consumption to be temporarily less than 0, + // so a small range of negative values is allowed; note that this tolerance may also + // obscure the errors above. + // + // A query corresponds to multiple threads, and each thread may hold up to + // config::mem_tracker_consume_min_size_bytes of accumulated, not-yet-consumed bytes, + // so the tolerance below is that value times a multiplier (1024 is just a guess). + void memory_leak_check(MemTracker* tracker) { + if (config::memory_leak_detection) { + DCHECK_GE(tracker->_consumption->current_value(), + -config::mem_tracker_consume_min_size_bytes * 1024) + << std::endl + << tracker->log_usage(); } - DCHECK(false) << "end_tracker is not an ancestor"; } - // Creates the root tracker. - static void CreateRootTracker(); + // Creates the process tracker. + static void create_process_tracker(); - /// Lock to protect GcMemory(). This prevents many GCs from occurring at once. - std::mutex gc_lock_; + // Limit on memory consumption, in bytes. If limit_ == -1, there is no consumption limit. + int64_t _limit; - /// Only used if 'is_query_mem_tracker_' is true. - /// 0 if the query is still executing or 1 if it has finished executing. Before - /// it has finished executing, the tracker limit is treated as "reserved memory" - /// for the purpose of admission control - see GetPoolMemReserved(). - std::atomic<int32_t> query_exec_finished_ {0}; + std::string _label; - /// Only valid for MemTrackers returned from GetRequestPoolMemTracker() - std::string pool_name_; + std::string _id; - /// Hard limit on memory consumption, in bytes. May not be exceeded. If limit_ == -1, - /// there is no consumption limit. - const int64_t limit_; + std::shared_ptr<MemTracker> _parent; // The parent of this tracker. - /// Soft limit on memory consumption, in bytes. Can be exceeded but callers to - /// TryConsume() can opt not to exceed this limit. If -1, there is no consumption limit. - const int64_t soft_limit_; - - std::string label_; - - /// The parent of this tracker. The pointer is never modified, even after this tracker - /// is unregistered. - std::shared_ptr<MemTracker> parent_; + MemTrackerLevel _level; - /// in bytes - std::shared_ptr<RuntimeProfile::HighWaterMarkCounter> consumption_; + bool _virtual = false; - /// If non-nullptr, used to measure consumption (in bytes) rather than the values provided - /// to Consume()/Release(). Only used for the process tracker, thus parent_ should be - /// nullptr if consumption_metric_ is set. - IntGauge* consumption_metric_; + std::shared_ptr<RuntimeProfile::HighWaterMarkCounter> _consumption; // in bytes - /// If non-nullptr, counters from a corresponding ReservationTracker that should be - /// reported in logs and other diagnostics. Owned by this MemTracker. The counters - /// are owned by the fragment's RuntimeProfile. - AtomicPtr<ReservationTrackerCounters> reservation_counters_; + // Consumptions smaller than mem_tracker_consume_min_size_bytes continue to accumulate + // to avoid frequent calls to consume/release on the MemTracker. + std::atomic<int64_t> _untracked_mem = 0; + SpinLock _untracked_mem_lock; - std::vector<MemTracker*> all_trackers_; // this tracker plus all of its ancestors - std::vector<MemTracker*> limit_trackers_; // all_trackers_ with valid limits + std::vector<MemTracker*> _all_trackers; // this tracker plus all of its ancestors + std::vector<MemTracker*> _limit_trackers; // _all_trackers with valid limits // All the child trackers of this tracker. Used for error reporting and // listing only (i.e.
updating the consumption of a parent tracker does not // update that of its children). - SpinLock child_trackers_lock_; - std::list<std::weak_ptr<MemTracker>> child_trackers_; - - /// Iterator into parent_->child_trackers_ for this object. Stored to have O(1) - /// remove. - std::list<std::weak_ptr<MemTracker>>::iterator child_tracker_it_; - - /// Functions to call after the limit is reached to free memory. - std::vector<GcFunction> gc_functions_; - - /// If false, this tracker (and its children) will not be included in LogUsage() output - /// if consumption is 0. - bool log_usage_if_zero_; - - MemTrackerLevel _level; - - /// The number of times the GcFunctions were called. - IntCounter* num_gcs_metric_; - - /// The number of bytes freed by the last round of calling the GcFunctions (-1 before any - /// GCs are performed). - IntGauge* bytes_freed_by_last_gc_metric_; - - /// The number of bytes over the limit we were the last time LimitExceeded() was called - /// and the limit was exceeded pre-GC. -1 if there is no limit or the limit was never - /// exceeded. - IntGauge* bytes_over_limit_metric_; - - /// Metric for limit_. - IntGauge* limit_metric_; + SpinLock _child_trackers_lock; + std::list<std::weak_ptr<MemTracker>> _child_trackers; + // Iterator into _parent->_child_trackers for this object. Stored to have O(1) remove. + std::list<std::weak_ptr<MemTracker>>::iterator _child_tracker_it; + + // Lock to protect gc_memory(). This prevents many GCs from occurring at once. + std::mutex _gc_lock; + // Functions to call after the limit is reached to free memory. + std::vector<GcFunction> _gc_functions; }; -/// Global registry for query and pool MemTrackers. Owned by ExecEnv. -class PoolMemTrackerRegistry { -public: - /// Returns a MemTracker object for request pool 'pool_name'. Calling this with the same - /// 'pool_name' will return the same MemTracker object. This is used to track the local - /// memory usage of all requests executing in this pool. If 'create_if_not_present' is - /// true, the first time this is called for a pool, a new MemTracker object is created - /// with the process tracker as its parent. There is no explicit per-pool byte_limit - /// set at any particular impalad, so newly created trackers will always have a limit - /// of -1. - /// TODO(cmy): this function is not used for now. the memtracker returned from here is - /// got from a shared_ptr in `pool_to_mem_trackers_`. - /// This funtion is from - /// https://github.com/cloudera/Impala/blob/495397101e5807c701df71ea288f4815d69c2c8a/be/src/runtime/mem-tracker.h#L497 - /// And in impala this function will return a raw pointer. - std::shared_ptr<MemTracker> GetRequestPoolMemTracker(const std::string& pool_name, - bool create_if_not_present); - -private: - /// All per-request pool MemTracker objects. It is assumed that request pools will live - /// for the entire duration of the process lifetime so MemTrackers are never removed - /// from this map. Protected by '_pool_to_mem_trackers_lock' - typedef std::unordered_map<std::string, std::shared_ptr<MemTracker>> PoolTrackersMap; - PoolTrackersMap pool_to_mem_trackers_; - /// IMPALA-3068: Use SpinLock instead of std::mutex so that the lock won't - /// automatically destroy itself as part of process teardown, which could cause races.
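The reason _child_tracker_it above is stored at insert time is the std::list guarantee that iterators stay valid until their own element is erased, which makes self-removal O(1). A toy demonstration with invented Parent/Node types:

```cpp
#include <iostream>
#include <list>
#include <memory>

// std::list iterators remain valid until their element is erased, so a child
// can remove itself from its parent's list in O(1) without a scan.
struct Node;
struct Parent {
    std::list<std::weak_ptr<Node>> children;
};

struct Node {
    Parent* parent = nullptr;
    std::list<std::weak_ptr<Node>>::iterator self_it; // saved at insert time
};

int main() {
    Parent p;
    auto a = std::make_shared<Node>();
    auto b = std::make_shared<Node>();
    for (auto& n : {a, b}) {
        n->parent = &p;
        n->self_it = p.children.insert(p.children.end(), n);
    }
    p.children.erase(a->self_it); // O(1), no search; b's iterator is unaffected
    std::cout << p.children.size() << std::endl; // 1
    return 0;
}
```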
- SpinLock pool_to_mem_trackers_lock_; -}; +#define RETURN_LIMIT_EXCEEDED(tracker, state, msg) return tracker->mem_limit_exceeded(state, msg); +#define RETURN_ALLOC_LIMIT_EXCEEDED(tracker, state, msg, size, st) \ + return tracker->mem_limit_exceeded(state, msg, size, st); +#define RETURN_IF_LIMIT_EXCEEDED(tracker, state, msg) \ + if (tracker->any_limit_exceeded()) RETURN_LIMIT_EXCEEDED(tracker, state, msg); } // namespace doris diff --git a/be/src/runtime/mem_tracker_task_pool.cpp b/be/src/runtime/mem_tracker_task_pool.cpp new file mode 100644 index 00000000000000..9c46eac8382f91 --- /dev/null +++ b/be/src/runtime/mem_tracker_task_pool.cpp @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "runtime/mem_tracker_task_pool.h" + +#include "common/config.h" +#include "runtime/exec_env.h" +#include "util/pretty_printer.h" + +namespace doris { + +std::shared_ptr<MemTracker> MemTrackerTaskPool::register_task_mem_tracker_impl( + const std::string& task_id, int64_t mem_limit, const std::string& label, + std::shared_ptr<MemTracker> parent) { + DCHECK(!task_id.empty()); + // The first time this task_id is registered, create a new MemTracker; otherwise keep the existing one. + // Combine create_tracker and emplace into one operation to avoid taking the lock twice.
+ _task_mem_trackers.try_emplace_l( + task_id, [](std::shared_ptr) {}, + MemTracker::create_tracker(mem_limit, label, parent, MemTrackerLevel::TASK)); + std::shared_ptr tracker = get_task_mem_tracker(task_id); + return tracker; +} + +std::shared_ptr MemTrackerTaskPool::register_query_mem_tracker( + const std::string& query_id, int64_t mem_limit) { + VLOG_FILE << "Register Query memory tracker, query id: " << query_id + << " limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); + return register_task_mem_tracker_impl(query_id, mem_limit, fmt::format("queryId={}", query_id), + ExecEnv::GetInstance()->query_pool_mem_tracker()); +} + +std::shared_ptr MemTrackerTaskPool::register_load_mem_tracker( + const std::string& load_id, int64_t mem_limit) { + VLOG_FILE << "Register Load memory tracker, load id: " << load_id + << " limit: " << PrettyPrinter::print(mem_limit, TUnit::BYTES); + return register_task_mem_tracker_impl(load_id, mem_limit, fmt::format("loadId={}", load_id), + ExecEnv::GetInstance()->load_pool_mem_tracker()); +} + +std::shared_ptr MemTrackerTaskPool::get_task_mem_tracker(const std::string& task_id) { + DCHECK(!task_id.empty()); + std::shared_ptr tracker = nullptr; + // Avoid using locks to resolve erase conflicts + _task_mem_trackers.if_contains(task_id, + [&tracker](std::shared_ptr v) { tracker = v; }); + return tracker; +} + +void MemTrackerTaskPool::logout_task_mem_tracker() { + std::vector expired_tasks; + for (auto it = _task_mem_trackers.begin(); it != _task_mem_trackers.end(); it++) { + // No RuntimeState uses this task MemTracker, it is only referenced by this map, delete it + if (it->second.use_count() == 1) { + if (!config::memory_leak_detection || it->second->consumption() == 0) { + // + it->second->parent()->consume(-it->second->consumption(), MemTracker::get_process_tracker().get()); + expired_tasks.emplace_back(it->first); + } else { + LOG(WARNING) << "Memory tracker " << it->second->debug_string() << " Memory leak " + << it->second->consumption(); + } + } + } + for (auto tid : expired_tasks) { + DCHECK(_task_mem_trackers[tid].use_count() == 1); + _task_mem_trackers.erase(tid); + VLOG_FILE << "Deregister task memory tracker, task id: " << tid; + } +} + +// TODO(zxy) +// /// Logs the usage of 'limit' number of queries based on maximum total memory +// /// consumption. +// std::string MemTracker::LogTopNQueries(int limit) { +// if (limit == 0) return ""; +// priority_queue, std::vector>, +// std::greater>> +// min_pq; +// GetTopNQueries(min_pq, limit); +// std::vector usage_strings(min_pq.size()); +// while (!min_pq.empty()) { +// usage_strings.push_back(min_pq.top().second); +// min_pq.pop(); +// } +// std::reverse(usage_strings.begin(), usage_strings.end()); +// return join(usage_strings, "\n"); +// } + +// /// Helper function for LogTopNQueries that iterates through the MemTracker hierarchy +// /// and populates 'min_pq' with 'limit' number of elements (that contain state related +// /// to query MemTrackers) based on maximum total memory consumption. 
+// void MemTracker::GetTopNQueries( +// priority_queue, std::vector>, +// greater>>& min_pq, +// int limit) { +// list> children; +// { +// lock_guard l(child_trackers_lock_); +// children = child_trackers_; +// } +// for (const auto& child_weak : children) { +// shared_ptr child = child_weak.lock(); +// if (child) { +// child->GetTopNQueries(min_pq, limit); +// } +// } +// } + +} // namespace doris diff --git a/be/src/runtime/mem_tracker_task_pool.h b/be/src/runtime/mem_tracker_task_pool.h new file mode 100644 index 00000000000000..0cf566446dadb0 --- /dev/null +++ b/be/src/runtime/mem_tracker_task_pool.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "runtime/mem_tracker.h" + +namespace doris { + +// Global task pool for query MemTrackers. Owned by ExecEnv. +class MemTrackerTaskPool { +public: + // Construct a MemTracker object for 'task_id' with 'mem_limit' as the memory limit. + // The MemTracker is a child of the pool MemTracker, Calling this with the same + // 'task_id' will return the same MemTracker object. This is used to track the local + // memory usage of all tasks executing. The first time this is called for a task, + // a new MemTracker object is created with the pool tracker as its parent. + // Newly created trackers will always have a limit of -1. + std::shared_ptr register_task_mem_tracker_impl(const std::string& task_id, + int64_t mem_limit, + const std::string& label, + std::shared_ptr parent); + std::shared_ptr register_query_mem_tracker(const std::string& query_id, + int64_t mem_limit); + std::shared_ptr register_load_mem_tracker(const std::string& load_id, + int64_t mem_limit); + + std::shared_ptr get_task_mem_tracker(const std::string& task_id); + + void logout_task_mem_tracker(); + +private: + // All per-task MemTracker objects. + // The life cycle of task memtracker in the process is the same as task runtime state, + // MemTrackers will be removed from this map after query finish or cancel. + using TaskTrackersMap = phmap::parallel_flat_hash_map< + std::string, std::shared_ptr, phmap::priv::hash_default_hash, + phmap::priv::hash_default_eq, + std::allocator>>, 12, + std::mutex>; + + TaskTrackersMap _task_mem_trackers; +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/runtime/memory/chunk.h b/be/src/runtime/memory/chunk.h index 332631d3fba1aa..249136ad29af78 100644 --- a/be/src/runtime/memory/chunk.h +++ b/be/src/runtime/memory/chunk.h @@ -22,6 +22,8 @@ namespace doris { +class MemTracker; + // A chunk of continuous memory. // Almost all files depend on this struct, and each modification // will result in recompilation of all files. 
So, we put it in a diff --git a/be/src/runtime/memory/chunk_allocator.cpp b/be/src/runtime/memory/chunk_allocator.cpp index cbc2462953c882..1082d577c8dc3f 100644 --- a/be/src/runtime/memory/chunk_allocator.cpp +++ b/be/src/runtime/memory/chunk_allocator.cpp @@ -22,8 +22,10 @@ #include #include "gutil/dynamic_annotations.h" +#include "runtime/mem_tracker.h" #include "runtime/memory/chunk.h" #include "runtime/memory/system_allocator.h" +#include "runtime/thread_context.h" #include "util/bit_util.h" #include "util/cpu_info.h" #include "util/doris_metrics.h" @@ -114,6 +116,9 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) : _reserve_bytes_limit(reserve_limit), _reserved_bytes(0), _arenas(CpuInfo::get_max_num_cores()) { + _mem_tracker = + MemTracker::create_tracker(-1, "ChunkAllocator", nullptr, MemTrackerLevel::OVERVIEW); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); for (int i = 0; i < _arenas.size(); ++i) { _arenas[i].reset(new ChunkArena()); } @@ -128,8 +133,15 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit) INT_COUNTER_METRIC_REGISTER(_chunk_allocator_metric_entity, chunk_pool_system_free_cost_ns); } -bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { +Status ChunkAllocator::allocate(size_t size, Chunk* chunk, const std::shared_ptr& tracker, bool check_limits) { // fast path: allocate from current core arena + std::shared_ptr reset_tracker = tracker ? tracker : thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker(); + if (check_limits) { + RETURN_IF_ERROR(_mem_tracker->try_transfer_to(reset_tracker, size)); + } else { + _mem_tracker->transfer_to(reset_tracker, size); + } + int core_id = CpuInfo::get_current_core(); chunk->size = size; chunk->core_id = core_id; @@ -138,7 +150,7 @@ bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { DCHECK_GE(_reserved_bytes, 0); _reserved_bytes.fetch_sub(size); chunk_pool_local_core_alloc_count->increment(1); - return true; + return Status::OK(); } if (_reserved_bytes > size) { // try to allocate from other core's arena @@ -150,29 +162,38 @@ bool ChunkAllocator::allocate(size_t size, Chunk* chunk) { chunk_pool_other_core_alloc_count->increment(1); // reset chunk's core_id to other chunk->core_id = core_id % _arenas.size(); - return true; + return Status::OK(); } } } int64_t cost_ns = 0; { + // SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); SCOPED_RAW_TIMER(&cost_ns); // allocate from system allocator + // _mem_tracker->consume_cache(size); chunk->data = SystemAllocator::allocate(size); } chunk_pool_system_alloc_count->increment(1); chunk_pool_system_alloc_cost_ns->increment(cost_ns); if (chunk->data == nullptr) { - return false; + reset_tracker->transfer_to(_mem_tracker, size); + return Status::MemoryAllocFailed( + fmt::format("ChunkAllocator failed to allocate chunk {} bytes", size)); } - return true; + return Status::OK(); } -void ChunkAllocator::free(const Chunk& chunk) { +void ChunkAllocator::free(Chunk& chunk, const std::shared_ptr& tracker) { if (chunk.core_id == -1) { return; } + if (tracker) { + tracker->transfer_to(_mem_tracker, chunk.size); + } else { + thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->transfer_to(_mem_tracker, chunk.size); + } int64_t old_reserved_bytes = _reserved_bytes; int64_t new_reserved_bytes = 0; do { @@ -180,7 +201,9 @@ void ChunkAllocator::free(const Chunk& chunk) { if (new_reserved_bytes > _reserve_bytes_limit) { int64_t cost_ns = 0; { + // SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); SCOPED_RAW_TIMER(&cost_ns); + // _mem_tracker->release_cache(chunk.size); 
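The allocate()/free() changes in this hunk account a cached chunk against the allocator's own tracker and hand it to the caller's tracker on allocation (and back on free), so the bytes are never counted twice. A toy model of that hand-off follows; ToyTracker is invented and skips the limit checking that the real try_transfer_to() performs:

```cpp
#include <cstdint>
#include <iostream>
#include <string>

// Toy model of the ownership hand-off in ChunkAllocator: bytes cached by the
// allocator are "held" by its tracker; on allocate they transfer to the
// caller's tracker, and on free they transfer back. Total stays constant.
struct ToyTracker {
    std::string label;
    int64_t consumption = 0;
    void transfer_to(ToyTracker& dst, int64_t bytes) {
        consumption -= bytes;      // release here...
        dst.consumption += bytes;  // ...consume there
    }
};

int main() {
    ToyTracker allocator{"ChunkAllocator", 4096};
    ToyTracker query{"query"};
    allocator.transfer_to(query, 4096); // allocate(): chunk now charged to the query
    query.transfer_to(allocator, 4096); // free(): chunk returns to the cache
    std::cout << allocator.consumption << " " << query.consumption << std::endl; // 4096 0
    return 0;
}
```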
SystemAllocator::free(chunk.data, chunk.size); } chunk_pool_system_free_count->increment(1); @@ -193,8 +216,8 @@ void ChunkAllocator::free(const Chunk& chunk) { _arenas[chunk.core_id]->push_free_chunk(chunk.data, chunk.size); } -bool ChunkAllocator::allocate_align(size_t size, Chunk* chunk) { - return allocate(BitUtil::RoundUpToPowerOfTwo(size), chunk); +Status ChunkAllocator::allocate_align(size_t size, Chunk* chunk, const std::shared_ptr& tracker, bool check_limits) { + return allocate(BitUtil::RoundUpToPowerOfTwo(size), chunk, tracker, check_limits); } } // namespace doris diff --git a/be/src/runtime/memory/chunk_allocator.h b/be/src/runtime/memory/chunk_allocator.h index d7eb22fefed8a3..366e66ccd57ece 100644 --- a/be/src/runtime/memory/chunk_allocator.h +++ b/be/src/runtime/memory/chunk_allocator.h @@ -28,6 +28,8 @@ namespace doris { class Chunk; class ChunkArena; class MetricEntity; +class MemTracker; +class Status; // Used to allocate memory with power-of-two length. // This Allocator allocate memory from system and cache free chunks for @@ -63,12 +65,14 @@ class ChunkAllocator { // Allocate a Chunk with a power-of-two length "size". // Return true if success and allocated chunk is saved in "chunk". // Otherwise return false. - bool allocate(size_t size, Chunk* chunk); + Status allocate(size_t size, Chunk* chunk, + const std::shared_ptr& tracker = std::shared_ptr(), bool check_limits = false); - bool allocate_align(size_t size, Chunk* chunk); + Status allocate_align(size_t size, Chunk* chunk, + const std::shared_ptr& tracker = std::shared_ptr(), bool check_limits = false); // Free chunk allocated from this allocator - void free(const Chunk& chunk); + void free(Chunk& chunk, const std::shared_ptr& tracker = std::shared_ptr()); private: static ChunkAllocator* _s_instance; @@ -79,6 +83,8 @@ class ChunkAllocator { std::vector> _arenas; std::shared_ptr _chunk_allocator_metric_entity; + + std::shared_ptr _mem_tracker; }; } // namespace doris diff --git a/be/src/runtime/memory_scratch_sink.h b/be/src/runtime/memory_scratch_sink.h index 658aa0eb6478ad..7f63f2cfb03bed 100644 --- a/be/src/runtime/memory_scratch_sink.h +++ b/be/src/runtime/memory_scratch_sink.h @@ -42,7 +42,6 @@ class RuntimeProfile; class BufferControlBlock; class ExprContext; class ResultWriter; -class MemTracker; class TupleRow; // used to push data to blocking queue diff --git a/be/src/runtime/mysql_table_sink.cpp b/be/src/runtime/mysql_table_sink.cpp index cb7911d9f2b904..0e5042c9b90afa 100644 --- a/be/src/runtime/mysql_table_sink.cpp +++ b/be/src/runtime/mysql_table_sink.cpp @@ -33,7 +33,7 @@ MysqlTableSink::MysqlTableSink(ObjectPool* pool, const RowDescriptor& row_desc, : _pool(pool), _row_desc(row_desc), _t_output_expr(t_exprs), - _mem_tracker(MemTracker::CreateTracker(-1, "MysqlTableSink")) { + _mem_tracker(MemTracker::create_tracker(-1, "MysqlTableSink")) { _name = "MysqlTableSink"; } diff --git a/be/src/runtime/odbc_table_sink.cpp b/be/src/runtime/odbc_table_sink.cpp index b92b1517a0f5af..991ca64f0b6fc9 100644 --- a/be/src/runtime/odbc_table_sink.cpp +++ b/be/src/runtime/odbc_table_sink.cpp @@ -21,18 +21,14 @@ #include "exprs/expr.h" #include "runtime/runtime_state.h" -#include "runtime/mem_tracker.h" -#include "util/runtime_profile.h" #include "util/debug_util.h" +#include "util/runtime_profile.h" namespace doris { OdbcTableSink::OdbcTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs) - : _pool(pool), - _row_desc(row_desc), - _t_output_expr(t_exprs), - 
_mem_tracker(MemTracker::CreateTracker(-1, "OdbcTableSink")) { + const std::vector& t_exprs) + : _pool(pool), _row_desc(row_desc), _t_output_expr(t_exprs) { _name = "OOBC_TABLE_SINK"; } @@ -56,7 +52,7 @@ Status OdbcTableSink::init(const TDataSink& t_sink) { Status OdbcTableSink::prepare(RuntimeState* state) { RETURN_IF_ERROR(DataSink::prepare(state)); // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _mem_tracker)); + RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _expr_mem_tracker)); std::stringstream title; title << "ODBC_TABLE_SINK (frag_id=" << state->fragment_instance_id() << ")"; // create profile @@ -100,4 +96,4 @@ Status OdbcTableSink::close(RuntimeState* state, Status exec_status) { return Status::OK(); } -} +} // namespace doris diff --git a/be/src/runtime/odbc_table_sink.h b/be/src/runtime/odbc_table_sink.h index 385075b49aa658..3f9c8fd3b25ce6 100644 --- a/be/src/runtime/odbc_table_sink.h +++ b/be/src/runtime/odbc_table_sink.h @@ -32,7 +32,6 @@ class TOdbcTableSink; class RuntimeState; class RuntimeProfile; class ExprContext; -class MemTracker; //This class is a sinker, which put input data to odbc table class OdbcTableSink : public DataSink { @@ -73,9 +72,8 @@ class OdbcTableSink : public DataSink { bool _use_transaction; RuntimeProfile* _profile; - std::shared_ptr _mem_tracker; }; -} +} // namespace doris #endif diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp index 92aefc4d282588..ede3241885b432 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -34,13 +34,14 @@ #include "runtime/result_buffer_mgr.h" #include "runtime/result_queue_mgr.h" #include "runtime/row_batch.h" +#include "runtime/thread_context.h" #include "util/container_util.hpp" #include "util/cpu_info.h" +#include "util/logging.h" #include "util/mem_info.h" #include "util/parse_util.h" #include "util/pretty_printer.h" #include "util/uid_util.h" -#include "util/logging.h" #include "vec/core/block.h" #include "vec/exec/vexchange_node.h" @@ -59,6 +60,8 @@ PlanFragmentExecutor::PlanFragmentExecutor(ExecEnv* exec_env, _closed(false), _is_report_success(true), _is_report_on_cancel(true), + _cancel_reason(PPlanFragmentCancelReason::INTERNAL_ERROR), + _cancel_msg(""), _collect_query_statistics_with_every_batch(false) {} PlanFragmentExecutor::~PlanFragmentExecutor() { @@ -74,10 +77,12 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, const TPlanFragmentExecParams& params = request.params; _query_id = params.query_id; - TAG(LOG(INFO)).log("PlanFragmentExecutor::prepare") - .query_id(_query_id).instance_id(params.fragment_instance_id) - .tag("backend_num", std::to_string(request.backend_num)) - .tag("pthread_id", std::to_string((uintptr_t) pthread_self())); + TAG(LOG(INFO)) + .log("PlanFragmentExecutor::prepare") + .query_id(_query_id) + .instance_id(params.fragment_instance_id) + .tag("backend_num", std::to_string(request.backend_num)) + .tag("pthread_id", std::to_string((uintptr_t)pthread_self())); // VLOG_CRITICAL << "request:\n" << apache::thrift::ThriftDebugString(request); const TQueryGlobals& query_globals = @@ -86,6 +91,10 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, _runtime_state->set_query_fragments_ctx(fragments_ctx); RETURN_IF_ERROR(_runtime_state->init_mem_trackers(_query_id)); + SCOPED_ATTACH_TASK_THREAD_4ARG(_runtime_state->query_type(), print_id(_runtime_state->query_id()), + 
_runtime_state->fragment_instance_id(), _runtime_state->instance_mem_tracker()); + // SCOPED_ATTACH_TASK_THREAD_4ARGP(_runtime_state->query_type(), print_id(_runtime_state->query_id()), + // _runtime_state->fragment_instance_id(), _runtime_state->instance_mem_tracker()); _runtime_state->set_be_number(request.backend_num); if (request.__isset.backend_id) { _runtime_state->set_backend_id(request.backend_id); @@ -114,21 +123,14 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, bytes_limit = 2 * 1024 * 1024 * 1024L; } - if (bytes_limit > _exec_env->process_mem_tracker()->limit()) { + if (bytes_limit > MemTracker::get_process_tracker()->limit()) { LOG(WARNING) << "Query memory limit " << PrettyPrinter::print(bytes_limit, TUnit::BYTES) << " exceeds process memory limit of " - << PrettyPrinter::print(_exec_env->process_mem_tracker()->limit(), + << PrettyPrinter::print(MemTracker::get_process_tracker()->limit(), TUnit::BYTES) << ". Using process memory limit instead"; - bytes_limit = _exec_env->process_mem_tracker()->limit(); + bytes_limit = MemTracker::get_process_tracker()->limit(); } - // NOTE: this MemTracker only for olap - _mem_tracker = MemTracker::CreateTracker(bytes_limit, - "PlanFragmentExecutor:" + print_id(_query_id) + ":" + - print_id(params.fragment_instance_id), - _exec_env->process_mem_tracker(), true, false, - MemTrackerLevel::TASK); - _runtime_state->set_fragment_mem_tracker(_mem_tracker); RETURN_IF_ERROR(_runtime_state->create_block_mgr()); @@ -217,8 +219,7 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, _rows_produced_counter = ADD_COUNTER(profile(), "RowsProduced", TUnit::UNIT); _fragment_cpu_timer = ADD_TIMER(profile(), "FragmentCpuTime"); - _row_batch.reset(new RowBatch(_plan->row_desc(), _runtime_state->batch_size(), - _runtime_state->instance_mem_tracker().get())); + _row_batch.reset(new RowBatch(_plan->row_desc(), _runtime_state->batch_size())); _block.reset(new doris::vectorized::Block()); // _row_batch->tuple_data_pool()->set_limits(*_runtime_state->mem_trackers()); VLOG_NOTICE << "plan_root=\n" << _plan->debug_string(); @@ -232,10 +233,13 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, } Status PlanFragmentExecutor::open() { - int64_t mem_limit = _runtime_state->fragment_mem_tracker()->limit(); - TAG(LOG(INFO)).log("PlanFragmentExecutor::open, using query memory limit: " + PrettyPrinter::print(mem_limit, TUnit::BYTES)) - .query_id(_query_id).instance_id(_runtime_state->fragment_instance_id()) - .tag("mem_limit", std::to_string(mem_limit)); + int64_t mem_limit = _runtime_state->instance_mem_tracker()->limit(); + TAG(LOG(INFO)) + .log("PlanFragmentExecutor::open, using query memory limit: " + + PrettyPrinter::print(mem_limit, TUnit::BYTES)) + .query_id(_query_id) + .instance_id(_runtime_state->fragment_instance_id()) + .tag("mem_limit", std::to_string(mem_limit)); // we need to start the profile-reporting thread before calling Open(), since it // may block @@ -262,6 +266,12 @@ Status PlanFragmentExecutor::open() { _runtime_state->log_error(status.get_error_msg()); } + if (status.is_cancelled()) { + if (_cancel_reason == PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED) { + status = Status::MemoryLimitExceeded(_cancel_msg); + } + } + update_status(status); return status; } @@ -444,11 +454,15 @@ void PlanFragmentExecutor::_collect_query_statistics() { void PlanFragmentExecutor::_collect_node_statistics() { DCHECK(_runtime_state->backend_id() != -1); - NodeStatistics* node_statistics 
= _query_statistics->add_nodes_statistics(_runtime_state->backend_id()); - node_statistics->add_peak_memory(_mem_tracker->peak_consumption()); + NodeStatistics* node_statistics = + _query_statistics->add_nodes_statistics(_runtime_state->backend_id()); + node_statistics->add_peak_memory(_runtime_state->instance_mem_tracker()->peak_consumption()); } void PlanFragmentExecutor::report_profile() { + SCOPED_ATTACH_TASK_THREAD_4ARG( + _runtime_state->query_type(), print_id(_runtime_state->query_id()), + _runtime_state->fragment_instance_id(), _runtime_state->instance_mem_tracker()); VLOG_FILE << "report_profile(): instance_id=" << _runtime_state->fragment_instance_id(); DCHECK(_report_status_cb); @@ -458,7 +472,6 @@ void PlanFragmentExecutor::report_profile() { // tell Open() that we started _report_thread_started_cv.notify_one(); - // Jitter the reporting time of remote fragments by a random amount between // 0 and the report_interval. This way, the coordinator doesn't get all the // updates at once so its better for contention as well as smoother progress @@ -552,8 +565,10 @@ Status PlanFragmentExecutor::get_next(RowBatch** batch) { update_status(status); if (_done) { - TAG(LOG(INFO)).log("PlanFragmentExecutor::get_next finished") - .query_id(_query_id).instance_id(_runtime_state->fragment_instance_id()); + TAG(LOG(INFO)) + .log("PlanFragmentExecutor::get_next finished") + .query_id(_query_id) + .instance_id(_runtime_state->fragment_instance_id()); // Query is done, return the thread token stop_report_thread(); send_report(true); @@ -598,7 +613,7 @@ void PlanFragmentExecutor::update_status(const Status& new_status) { _runtime_state->set_mem_limit_exceeded(new_status.get_error_msg()); } _status = new_status; - if (_runtime_state->query_options().query_type == TQueryType::EXTERNAL) { + if (_runtime_state->query_type() == TQueryType::EXTERNAL) { TUniqueId fragment_instance_id = _runtime_state->fragment_instance_id(); _exec_env->result_queue_mgr()->update_queue_status(fragment_instance_id, new_status); @@ -610,10 +625,14 @@ void PlanFragmentExecutor::update_status(const Status& new_status) { send_report(true); } -void PlanFragmentExecutor::cancel() { - TAG(LOG(INFO)).log("PlanFragmentExecutor::cancel") - .query_id(_query_id).instance_id(_runtime_state->fragment_instance_id()); +void PlanFragmentExecutor::cancel(const PPlanFragmentCancelReason& reason, const std::string& msg) { + TAG(LOG(INFO)) + .log("PlanFragmentExecutor::cancel") + .query_id(_query_id) + .instance_id(_runtime_state->fragment_instance_id()); DCHECK(_prepared); + _cancel_reason = reason; + _cancel_msg = msg; _runtime_state->set_is_cancelled(true); // must close stream_mgr to avoid dead lock in Exchange Node @@ -680,10 +699,6 @@ void PlanFragmentExecutor::close() { << print_id(_runtime_state->fragment_instance_id()); } - // _mem_tracker init failed - if (_mem_tracker.get() != nullptr) { - _mem_tracker->Release(_mem_tracker->consumption()); - } _closed = true; } diff --git a/be/src/runtime/plan_fragment_executor.h b/be/src/runtime/plan_fragment_executor.h index 3cdb6bb2495f50..12201d6d71da45 100644 --- a/be/src/runtime/plan_fragment_executor.h +++ b/be/src/runtime/plan_fragment_executor.h @@ -24,6 +24,7 @@ #include "common/object_pool.h" #include "common/status.h" +#include "gen_cpp/internal_service.pb.h" #include "runtime/datetime_value.h" #include "runtime/query_fragments_ctx.h" #include "runtime/query_statistics.h" @@ -127,7 +128,8 @@ class PlanFragmentExecutor { void set_abort(); // Initiate cancellation. 
Must not be called until after prepare() returned. - void cancel(); + void cancel(const PPlanFragmentCancelReason& reason = PPlanFragmentCancelReason::INTERNAL_ERROR, + const std::string& msg = ""); // call these only after prepare() RuntimeState* runtime_state() { return _runtime_state.get(); } @@ -146,7 +148,6 @@ class PlanFragmentExecutor { ExecEnv* _exec_env; // not owned ExecNode* _plan; // lives in _runtime_state->obj_pool() TUniqueId _query_id; - std::shared_ptr _mem_tracker; // profile reporting-related report_status_callback _report_status_cb; @@ -177,6 +178,10 @@ class PlanFragmentExecutor { // This executor will not report status to FE on being cancelled. bool _is_report_on_cancel; + // Record the cancel information when calling the cancel() method, return it to FE + PPlanFragmentCancelReason _cancel_reason; + std::string _cancel_msg; + // Overall execution status. Either ok() or set to the first error status that // was encountered. Status _status; diff --git a/be/src/runtime/qsorter.cpp b/be/src/runtime/qsorter.cpp index 951b35a63afaac..cec1b6cd2e3ad6 100644 --- a/be/src/runtime/qsorter.cpp +++ b/be/src/runtime/qsorter.cpp @@ -81,7 +81,7 @@ QSorter::QSorter(const RowDescriptor& row_desc, const std::vector& RuntimeState* state) : _row_desc(row_desc), _order_expr_ctxs(order_expr_ctxs), - _tuple_pool(new MemPool(state->instance_mem_tracker().get())) {} + _tuple_pool(new MemPool("QSorter")) {} Status QSorter::prepare(RuntimeState* state) { RETURN_IF_ERROR(Expr::clone_if_not_exists(_order_expr_ctxs, state, &_lhs_expr_ctxs)); diff --git a/be/src/runtime/result_file_sink.cpp b/be/src/runtime/result_file_sink.cpp index efe367cda6931f..878f8fe67890a7 100644 --- a/be/src/runtime/result_file_sink.cpp +++ b/be/src/runtime/result_file_sink.cpp @@ -22,7 +22,6 @@ #include "runtime/buffer_control_block.h" #include "runtime/exec_env.h" #include "runtime/file_result_writer.h" -#include "runtime/mem_tracker.h" #include "runtime/mysql_result_writer.h" #include "runtime/result_buffer_mgr.h" #include "runtime/row_batch.h" @@ -110,11 +109,8 @@ Status ResultFileSink::prepare(RuntimeState* state) { _local_bytes_send_counter = ADD_COUNTER(profile(), "LocalBytesSent", TUnit::BYTES); _uncompressed_bytes_counter = ADD_COUNTER(profile(), "UncompressedRowBatchSize", TUnit::BYTES); - _mem_tracker = MemTracker::CreateTracker( - _profile, -1, "ResultFileSink:" + print_id(state->fragment_instance_id()), - state->instance_mem_tracker()); // create writer - _output_batch = new RowBatch(_output_row_descriptor, 1024, _mem_tracker.get()); + _output_batch = new RowBatch(_output_row_descriptor, 1024); _writer.reset(new (std::nothrow) FileResultWriter( _file_opts.get(), _storage_type, state->fragment_instance_id(), _output_expr_ctxs, _profile, nullptr, _output_batch, state->return_object_data_as_binary())); diff --git a/be/src/runtime/result_file_sink.h b/be/src/runtime/result_file_sink.h index cef47cc10a8577..60a1ae2f03c238 100644 --- a/be/src/runtime/result_file_sink.h +++ b/be/src/runtime/result_file_sink.h @@ -34,7 +34,6 @@ class RuntimeProfile; class BufferControlBlock; class ExprContext; class ResultWriter; -class MemTracker; class ResultFileOptions; class ResultFileSink : public DataStreamSender { diff --git a/be/src/runtime/result_sink.cpp b/be/src/runtime/result_sink.cpp index 610f105074c5bb..b83ae8af3b0f4e 100644 --- a/be/src/runtime/result_sink.cpp +++ b/be/src/runtime/result_sink.cpp @@ -23,6 +23,7 @@ #include "runtime/exec_env.h" #include "runtime/file_result_writer.h" #include "runtime/mem_tracker.h" 
+#include "runtime/thread_context.h" #include "runtime/mysql_result_writer.h" #include "runtime/result_buffer_mgr.h" #include "runtime/row_batch.h" @@ -100,6 +101,10 @@ Status ResultSink::open(RuntimeState* state) { } Status ResultSink::send(RuntimeState* state, RowBatch* batch) { + // The memory consumption in the process of sending the results is not recorded in the query memory. + // 1. Avoid the query being cancelled when the memory limit is reached after the query result comes out. + // 2. If record this memory, also need to record on the receiving end, need to consider the life cycle of MemTracker. + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); return _writer->append_row_batch(batch); } diff --git a/be/src/runtime/result_sink.h b/be/src/runtime/result_sink.h index 08fd6338c38f87..5368c8bcd8d3fa 100644 --- a/be/src/runtime/result_sink.h +++ b/be/src/runtime/result_sink.h @@ -33,7 +33,6 @@ class RuntimeProfile; class BufferControlBlock; class ExprContext; class ResultWriter; -class MemTracker; class ResultFileOptions; namespace vectorized { diff --git a/be/src/runtime/row_batch.cpp b/be/src/runtime/row_batch.cpp index eb223d661d5d7e..7bb1795caa5362 100644 --- a/be/src/runtime/row_batch.cpp +++ b/be/src/runtime/row_batch.cpp @@ -27,6 +27,7 @@ #include "runtime/collection_value.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "runtime/string_value.h" #include "runtime/tuple_row.h" @@ -40,8 +41,8 @@ namespace doris { const int RowBatch::AT_CAPACITY_MEM_USAGE = 8 * 1024 * 1024; const int RowBatch::FIXED_LEN_BUFFER_LIMIT = AT_CAPACITY_MEM_USAGE / 2; -RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_tracker) - : _mem_tracker(mem_tracker), +RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity) + : _mem_tracker(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()), _has_in_flight_row(false), _num_rows(0), _num_uncommitted_rows(0), @@ -52,14 +53,14 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_ _row_desc(row_desc), _auxiliary_mem_usage(0), _need_to_return(false), - _tuple_data_pool(_mem_tracker) { - DCHECK(_mem_tracker != nullptr); + _tuple_data_pool() { DCHECK_GT(capacity, 0); _tuple_ptrs_size = _capacity * _num_tuples_per_row * sizeof(Tuple*); DCHECK_GT(_tuple_ptrs_size, 0); // TODO: switch to Init() pattern so we can check memory limit and return Status. 
if (config::enable_partitioned_aggregation) { - _mem_tracker->Consume(_tuple_ptrs_size); + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); + _mem_tracker->consume(_tuple_ptrs_size); _tuple_ptrs = (Tuple**)(malloc(_tuple_ptrs_size)); DCHECK(_tuple_ptrs != nullptr); } else { @@ -73,8 +74,8 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_ // xfer += iprot->readString(this->tuple_data[_i9]); // to allocated string data in special mempool // (change via python script that runs over Data_types.cc) -RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch, MemTracker* tracker) - : _mem_tracker(tracker), +RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch) + : _mem_tracker(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()), _has_in_flight_row(false), _num_rows(input_batch.num_rows()), _num_uncommitted_rows(0), @@ -85,13 +86,13 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch, _row_desc(row_desc), _auxiliary_mem_usage(0), _need_to_return(false), - _tuple_data_pool(_mem_tracker) { - DCHECK(_mem_tracker != nullptr); + _tuple_data_pool() { _tuple_ptrs_size = _num_rows * _num_tuples_per_row * sizeof(Tuple*); DCHECK_GT(_tuple_ptrs_size, 0); // TODO: switch to Init() pattern so we can check memory limit and return Status. if (config::enable_partitioned_aggregation) { - _mem_tracker->Consume(_tuple_ptrs_size); + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); + _mem_tracker->consume(_tuple_ptrs_size); _tuple_ptrs = (Tuple**)(malloc(_tuple_ptrs_size)); DCHECK(_tuple_ptrs != nullptr); } else { @@ -235,9 +236,10 @@ void RowBatch::clear() { _blocks[i]->del(); } if (config::enable_partitioned_aggregation) { + SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER(); DCHECK(_tuple_ptrs != nullptr); free(_tuple_ptrs); - _mem_tracker->Release(_tuple_ptrs_size); + _mem_tracker->release(_tuple_ptrs_size); _tuple_ptrs = nullptr; } _cleared = true; @@ -353,7 +355,7 @@ void RowBatch::add_io_buffer(DiskIoMgr::BufferDescriptor* buffer) { DCHECK(buffer != nullptr); _io_buffers.push_back(buffer); _auxiliary_mem_usage += buffer->buffer_len(); - buffer->set_mem_tracker(std::shared_ptr(_mem_tracker)); // TODO(yingchun): fixme + buffer->set_mem_tracker(_mem_tracker); } Status RowBatch::resize_and_allocate_tuple_buffer(RuntimeState* state, int64_t* tuple_buffer_size, @@ -435,8 +437,7 @@ void RowBatch::transfer_resource_ownership(RowBatch* dest) { DiskIoMgr::BufferDescriptor* buffer = _io_buffers[i]; dest->_io_buffers.push_back(buffer); dest->_auxiliary_mem_usage += buffer->buffer_len(); - buffer->set_mem_tracker( - std::shared_ptr(dest->_mem_tracker)); // TODO(yingchun): fixme + buffer->set_mem_tracker(dest->_mem_tracker); } _io_buffers.clear(); @@ -545,7 +546,7 @@ void RowBatch::acquire_state(RowBatch* src) { DiskIoMgr::BufferDescriptor* buffer = src->_io_buffers[i]; _io_buffers.push_back(buffer); _auxiliary_mem_usage += buffer->buffer_len(); - buffer->set_mem_tracker(std::shared_ptr(_mem_tracker)); // TODO(yingchun): fixme + buffer->set_mem_tracker(_mem_tracker); } src->_io_buffers.clear(); src->_auxiliary_mem_usage = 0; diff --git a/be/src/runtime/row_batch.h b/be/src/runtime/row_batch.h index 070a1e578fb4e5..5674c1f541fa72 100644 --- a/be/src/runtime/row_batch.h +++ b/be/src/runtime/row_batch.h @@ -83,14 +83,14 @@ class RowBatch : public RowBatchInterface { // Create RowBatch for a maximum of 'capacity' rows of tuples specified // by 'row_desc'. 
- RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_tracker); + RowBatch(const RowDescriptor& row_desc, int capacity); // Populate a row batch from input_batch by copying input_batch's // tuple_data into the row batch's mempool and converting all offsets // in the data back into pointers. // TODO: figure out how to transfer the data from input_batch to this RowBatch // (so that we don't need to make yet another copy) - RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch, MemTracker* tracker); + RowBatch(const RowDescriptor& row_desc, const PRowBatch& input_batch); // Releases all resources accumulated at this row batch. This includes // - tuple_ptrs @@ -394,7 +394,7 @@ class RowBatch : public RowBatchInterface { std::string to_string(); private: - MemTracker* _mem_tracker; // not owned + std::shared_ptr _mem_tracker; // not owned // Close owned tuple streams and delete if needed. void close_tuple_streams(); diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index b5302aeaceb89e..cfadb41133e84c 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -27,6 +27,7 @@ #include "runtime/plan_fragment_executor.h" #include "runtime/runtime_filter_mgr.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "service/brpc.h" #include "util/brpc_client_cache.h" #include "util/time.h" @@ -46,13 +47,15 @@ RuntimeFilterMgr::RuntimeFilterMgr(const UniqueId& query_id, RuntimeState* state RuntimeFilterMgr::~RuntimeFilterMgr() {} Status RuntimeFilterMgr::init() { - DCHECK(_state->instance_mem_tracker().get() != nullptr); - _tracker = _state->instance_mem_tracker().get(); + DCHECK(_state->instance_mem_tracker() != nullptr); + _tracker = MemTracker::create_tracker(-1, "RuntimeFilterMgr", _state->instance_mem_tracker(), + MemTrackerLevel::TASK); return Status::OK(); } Status RuntimeFilterMgr::get_filter_by_role(const int filter_id, const RuntimeFilterRole role, IRuntimeFilter** target) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_tracker); int32_t key = filter_id; std::map* filter_map = nullptr; @@ -84,6 +87,7 @@ Status RuntimeFilterMgr::regist_filter(const RuntimeFilterRole role, const TRunt const TQueryOptions& options, int node_id) { DCHECK((role == RuntimeFilterRole::CONSUMER && node_id >= 0) || role != RuntimeFilterRole::CONSUMER); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_tracker); int32_t key = desc.filter_id; std::map* filter_map = nullptr; @@ -102,8 +106,8 @@ Status RuntimeFilterMgr::regist_filter(const RuntimeFilterRole role, const TRunt RuntimeFilterMgrVal filter_mgr_val; filter_mgr_val.role = role; - RETURN_IF_ERROR(IRuntimeFilter::create(_state, _tracker, &_pool, &desc, &options, - role, node_id, &filter_mgr_val.filter)); + RETURN_IF_ERROR(IRuntimeFilter::create(_state, &_pool, &desc, &options, role, node_id, + &filter_mgr_val.filter)); filter_map->emplace(key, filter_mgr_val); @@ -111,6 +115,7 @@ Status RuntimeFilterMgr::regist_filter(const RuntimeFilterRole role, const TRunt } Status RuntimeFilterMgr::update_filter(const PPublishFilterRequest* request, const char* data) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_tracker); UpdateRuntimeFilterParams params; params.request = request; params.data = data; @@ -137,26 +142,25 @@ Status RuntimeFilterMgr::get_merge_addr(TNetworkAddress* addr) { } Status RuntimeFilterMergeControllerEntity::_init_with_desc( - const TRuntimeFilterDesc* runtime_filter_desc, - const TQueryOptions* query_options, + 
const TRuntimeFilterDesc* runtime_filter_desc, const TQueryOptions* query_options,
 const std::vector<TRuntimeFilterTargetParams>* target_info, const int producer_size) {
 std::lock_guard<std::mutex> guard(_filter_map_mutex);
 std::shared_ptr<RuntimeFilterCntlVal> cntVal = std::make_shared<RuntimeFilterCntlVal>();
 // runtime_filter_desc and target will be released,
 // so we need to copy to cntVal
- // TODO: tracker should add a name
 cntVal->producer_size = producer_size;
 cntVal->runtime_filter_desc = *runtime_filter_desc;
 cntVal->target_info = *target_info;
 cntVal->pool.reset(new ObjectPool());
- cntVal->tracker = MemTracker::CreateTracker();
- cntVal->filter = cntVal->pool->add(
- new IRuntimeFilter(nullptr, cntVal->tracker.get(), cntVal->pool.get()));
+ cntVal->filter = cntVal->pool->add(new IRuntimeFilter(nullptr, cntVal->pool.get()));
 std::string filter_id = std::to_string(runtime_filter_desc->filter_id);
 // LOG(INFO) << "entity filter id:" << filter_id;
- cntVal->filter->init_with_desc(&cntVal->runtime_filter_desc, query_options, _fragment_instance_id);
+ cntVal->filter->init_with_desc(&cntVal->runtime_filter_desc, query_options,
+ _fragment_instance_id);
+ cntVal->_tracker = MemTracker::create_tracker(
+ -1, thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->label() + ":FilterID:" + filter_id);
 _filter_map.emplace(filter_id, cntVal);
 return Status::OK();
 }
@@ -166,6 +170,9 @@ Status RuntimeFilterMergeControllerEntity::init(UniqueId query_id, UniqueId frag const TQueryOptions& query_options) {
 _query_id = query_id;
 _fragment_instance_id = fragment_instance_id;
+ _mem_tracker = MemTracker::create_tracker(-1, "RuntimeFilterMergeControllerEntity", nullptr,
+ MemTrackerLevel::INSTANCE);
+ SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
 for (auto& filterid_to_desc : runtime_filter_params.rid_to_runtime_filter) {
 int filter_id = filterid_to_desc.first;
 const auto& target_iter = runtime_filter_params.rid_to_target_param.find(filter_id);
@@ -176,7 +183,8 @@ Status RuntimeFilterMergeControllerEntity::init(UniqueId query_id, UniqueId frag
 if (build_iter == runtime_filter_params.runtime_filter_builder_num.end()) {
 return Status::InternalError("runtime filter params meet error");
 }
- _init_with_desc(&filterid_to_desc.second, &query_options, &target_iter->second, build_iter->second);
+ _init_with_desc(&filterid_to_desc.second, &query_options, &target_iter->second,
+ build_iter->second);
 }
 return Status::OK();
 }
@@ -184,6 +192,7 @@ Status RuntimeFilterMergeControllerEntity::init(UniqueId query_id, UniqueId frag
 // merge data
 Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* request, const char* data) {
+ SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
 std::shared_ptr<RuntimeFilterCntlVal> cntVal;
 int merged_size = 0;
 {
@@ -195,14 +204,13 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ
 return Status::InvalidArgument("unknown filter id");
 }
 cntVal = iter->second;
+ SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(cntVal->_tracker);
 MergeRuntimeFilterParams params;
 params.data = data;
 params.request = request;
- std::shared_ptr<MemTracker> tracker = iter->second->tracker;
 ObjectPool* pool = iter->second->pool.get();
 RuntimeFilterWrapperHolder holder;
- RETURN_IF_ERROR(
- IRuntimeFilter::create_wrapper(&params, tracker.get(), pool, holder.getHandle()));
+ RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(&params, pool, holder.getHandle()));
 RETURN_IF_ERROR(cntVal->filter->merge_from(holder.getHandle()->get()));
 cntVal->arrive_id.insert(UniqueId(request->fragment_id()).to_string());
 merged_size = cntVal->arrive_id.size();
diff --git
a/be/src/runtime/runtime_filter_mgr.h b/be/src/runtime/runtime_filter_mgr.h index 653ce675b2356a..ec6740673821a9 100644 --- a/be/src/runtime/runtime_filter_mgr.h +++ b/be/src/runtime/runtime_filter_mgr.h @@ -91,7 +91,7 @@ class RuntimeFilterMgr { std::map _producer_map; RuntimeState* _state; - MemTracker* _tracker; + std::shared_ptr _tracker; ObjectPool _pool; TNetworkAddress _merge_addr; @@ -130,13 +130,14 @@ class RuntimeFilterMergeControllerEntity { std::vector target_info; IRuntimeFilter* filter; std::unordered_set arrive_id; // fragment_instance_id ? - std::shared_ptr tracker; + std::shared_ptr _tracker; std::shared_ptr pool; }; UniqueId _query_id; UniqueId _fragment_instance_id; // protect _filter_map std::mutex _filter_map_mutex; + std::shared_ptr _mem_tracker; // TODO: convert filter id to i32 // filter-id -> val std::map> _filter_map; diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index e506189383a241..9051011eff422c 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -36,7 +36,9 @@ #include "runtime/initial_reservations.h" #include "runtime/load_path_mgr.h" #include "runtime/mem_tracker.h" +#include "runtime/mem_tracker_task_pool.h" #include "runtime/runtime_filter_mgr.h" +#include "runtime/thread_context.h" #include "util/cpu_info.h" #include "util/disk_info.h" #include "util/file_utils.h" @@ -52,8 +54,7 @@ namespace doris { RuntimeState::RuntimeState(const TUniqueId& fragment_instance_id, const TQueryOptions& query_options, const TQueryGlobals& query_globals, ExecEnv* exec_env) - : _fragment_mem_tracker(nullptr), - _profile("Fragment " + print_id(fragment_instance_id)), + : _profile("Fragment " + print_id(fragment_instance_id)), _obj_pool(new ObjectPool()), _runtime_filter_mgr(new RuntimeFilterMgr(TUniqueId(), this)), _data_stream_recvrs_pool(new ObjectPool()), @@ -79,8 +80,7 @@ RuntimeState::RuntimeState(const TUniqueId& fragment_instance_id, RuntimeState::RuntimeState(const TPlanFragmentExecParams& fragment_exec_params, const TQueryOptions& query_options, const TQueryGlobals& query_globals, ExecEnv* exec_env) - : _fragment_mem_tracker(nullptr), - _profile("Fragment " + print_id(fragment_exec_params.fragment_instance_id)), + : _profile("Fragment " + print_id(fragment_exec_params.fragment_instance_id)), _obj_pool(new ObjectPool()), _runtime_filter_mgr(new RuntimeFilterMgr(fragment_exec_params.query_id, this)), _data_stream_recvrs_pool(new ObjectPool()), @@ -206,42 +206,33 @@ Status RuntimeState::init(const TUniqueId& fragment_instance_id, const TQueryOpt Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { bool has_query_mem_tracker = _query_options.__isset.mem_limit && (_query_options.mem_limit > 0); int64_t bytes_limit = has_query_mem_tracker ? _query_options.mem_limit : -1; - // we do not use global query-map for now, to avoid mem-exceeded different fragments - // running on the same machine. - // TODO(lingbin): open it later. note that open with BufferedBlockMgr's BlockMgrsMap - // at the same time. 
- - // _query_mem_tracker = MemTracker::get_query_mem_tracker( - // query_id, bytes_limit, _exec_env->process_mem_tracker()); - auto mem_tracker_counter = ADD_COUNTER(&_profile, "MemoryLimit", TUnit::BYTES); mem_tracker_counter->set(bytes_limit); - _query_mem_tracker = - MemTracker::CreateTracker(bytes_limit, "RuntimeState:query:" + print_id(query_id), - _exec_env->process_mem_tracker(), true, false); - _instance_mem_tracker = - MemTracker::CreateTracker(&_profile, -1, "RuntimeState:instance:", _query_mem_tracker); - - /* - // TODO: this is a stopgap until we implement ExprContext - _udf_mem_tracker.reset( - new MemTracker(-1, "UDFs", _instance_mem_tracker.get())); - _udf_pool.reset(new MemPool(_udf_mem_tracker.get())); - */ - // _udf_pool.reset(new MemPool(_instance_mem_tracker.get())); + if (query_type() == TQueryType::SELECT) { + _query_mem_tracker = _exec_env->task_pool_mem_tracker_registry()->register_query_mem_tracker( + print_id(query_id), bytes_limit); + } else if (query_type() == TQueryType::LOAD) { + _query_mem_tracker = _exec_env->task_pool_mem_tracker_registry()->register_load_mem_tracker( + print_id(query_id), bytes_limit); + } else { + DCHECK(false); + } + + _instance_mem_tracker = MemTracker::create_tracker( + -1, "RuntimeState:instance:" + print_id(_fragment_instance_id), _query_mem_tracker, + MemTrackerLevel::INSTANCE, &_profile); RETURN_IF_ERROR(init_buffer_poolstate()); _initial_reservations = _obj_pool->add( - new InitialReservations(_obj_pool.get(), _buffer_reservation, _query_mem_tracker, + new InitialReservations(_obj_pool.get(), _buffer_reservation, nullptr, _query_options.initial_reservation_total_claims)); RETURN_IF_ERROR(_initial_reservations->Init(_query_id, min_reservation())); DCHECK_EQ(0, _initial_reservation_refcnt.load()); if (_instance_buffer_reservation != nullptr) { - _instance_buffer_reservation->InitChildTracker(&_profile, _buffer_reservation, - _instance_mem_tracker.get(), + _instance_buffer_reservation->InitChildTracker(&_profile, _buffer_reservation, nullptr, std::numeric_limits::max()); } @@ -251,13 +242,13 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { } Status RuntimeState::init_instance_mem_tracker() { - _instance_mem_tracker = MemTracker::CreateTracker(-1, "RuntimeState"); + _instance_mem_tracker = MemTracker::create_tracker(-1, "RuntimeState"); return Status::OK(); } Status RuntimeState::init_buffer_poolstate() { ExecEnv* exec_env = ExecEnv::GetInstance(); - int64_t mem_limit = _query_mem_tracker->GetLowestLimit(MemLimit::HARD); + int64_t mem_limit = _query_mem_tracker->get_lowest_limit(); int64_t max_reservation; if (query_options().__isset.buffer_pool_limit && query_options().buffer_pool_limit > 0) { max_reservation = query_options().buffer_pool_limit; @@ -273,8 +264,8 @@ Status RuntimeState::init_buffer_poolstate() { VLOG_QUERY << "Buffer pool limit for " << print_id(_query_id) << ": " << max_reservation; _buffer_reservation = _obj_pool->add(new ReservationTracker); - _buffer_reservation->InitChildTracker(nullptr, exec_env->buffer_reservation(), - _query_mem_tracker.get(), max_reservation); + _buffer_reservation->InitChildTracker(nullptr, exec_env->buffer_reservation(), nullptr, + max_reservation); return Status::OK(); } @@ -287,7 +278,7 @@ Status RuntimeState::create_block_mgr() { block_mgr_limit = std::numeric_limits::max(); } RETURN_IF_ERROR(BufferedBlockMgr2::create( - this, _query_mem_tracker, runtime_profile(), _exec_env->tmp_file_mgr(), block_mgr_limit, + this, runtime_profile(), _exec_env->tmp_file_mgr(), 
block_mgr_limit, _exec_env->disk_io_mgr()->max_read_buffer_size(), &_block_mgr2));
 return Status::OK();
 }
@@ -330,46 +321,13 @@ void RuntimeState::get_unreported_errors(std::vector<std::string>* new_errors) {
 }
 }
-Status RuntimeState::set_mem_limit_exceeded(MemTracker* tracker, int64_t failed_allocation_size,
- const std::string* msg) {
- DCHECK_GE(failed_allocation_size, 0);
+Status RuntimeState::set_mem_limit_exceeded(const std::string& msg) {
 {
 std::lock_guard<std::mutex> l(_process_status_lock);
 if (_process_status.ok()) {
- if (msg != nullptr) {
- _process_status = Status::MemoryLimitExceeded(*msg);
- } else {
- _process_status = Status::MemoryLimitExceeded("Memory limit exceeded");
- }
- } else {
- return _process_status;
+ _process_status = Status::MemoryLimitExceeded(msg);
 }
 }
-
- DCHECK(_query_mem_tracker.get() != nullptr);
- std::stringstream ss;
- ss << "Memory Limit Exceeded\n";
- if (failed_allocation_size != 0) {
- DCHECK(tracker != nullptr);
- ss << " " << tracker->label() << " could not allocate "
- << PrettyPrinter::print(failed_allocation_size, TUnit::BYTES)
- << " without exceeding limit." << std::endl;
- }
-
- // if (_exec_env->process_mem_tracker()->LimitExceeded()) {
- // ss << _exec_env->process_mem_tracker()->LogUsage();
- // } else {
- // ss << _query_mem_tracker->LogUsage();
- // }
- // log_error(ErrorMsg(TErrorCode::GENERAL, ss.str()));
- log_error(ss.str());
- // Add warning about missing stats except for compute stats child queries.
- // if (!query_ctx().__isset.parent_query_id &&
- // query_ctx().__isset.tables_missing_stats &&
- // !query_ctx().tables_missing_stats.empty()) {
- // LogError(ErrorMsg(TErrorCode::GENERAL,
- // GetTablesMissingStatsWarning(query_ctx().tables_missing_stats)));
- // }
 DCHECK(_process_status.is_mem_limit_exceeded());
 return _process_status;
 }
@@ -377,7 +335,7 @@ Status RuntimeState::set_mem_limit_exceeded(MemTracker* tracker, int64_t failed_
 Status RuntimeState::check_query_state(const std::string& msg) {
 // TODO: it would be nice if this also checked for cancellation, but doing so breaks
 // cases where we use Status::Cancelled("Cancelled") to indicate that the limit was reached.
- RETURN_IF_LIMIT_EXCEEDED(this, msg);
+ RETURN_IF_LIMIT_EXCEEDED(_instance_mem_tracker, this, msg);
 return query_status();
 }
@@ -421,7 +379,7 @@ Status RuntimeState::create_error_log_file() {
 Status RuntimeState::append_error_msg_to_file(std::function<std::string()> line, std::function<std::string()> error_msg, bool* stop_processing, bool is_summary) {
 *stop_processing = false;
- if (_query_options.query_type != TQueryType::LOAD) {
+ if (query_type() != TQueryType::LOAD) {
 return Status::OK();
 }
 // If the file hasn't been opened, open it here
@@ -493,12 +451,6 @@ void RuntimeState::export_load_error(const std::string& err_msg) {
 }
 }
-// TODO chenhao , check scratch_limit, disable_spilling and file_group
-// before spillng
-Status RuntimeState::StartSpilling(MemTracker* mem_tracker) {
- return Status::InternalError("Mem limit exceeded.");
-}
-
 int64_t RuntimeState::get_load_mem_limit() {
 if (_query_options.__isset.load_mem_limit && _query_options.load_mem_limit > 0) {
 return _query_options.load_mem_limit;
diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 449b4c2a1738ac..e058706928a086 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h
@@ -84,7 +84,6 @@ class RuntimeState {
 // The instance tracker is tied to our profile.
 // Specific parts of the fragment (i.e.
exec nodes, sinks, data stream senders, etc) // will add a fourth level when they are initialized. - // This function also initializes a user function mem tracker (in the fourth level). Status init_mem_trackers(const TUniqueId& query_id); // for ut only @@ -113,6 +112,7 @@ class RuntimeState { int max_errors() const { return _query_options.max_errors; } int max_io_buffers() const { return _query_options.max_io_buffers; } int num_scanner_threads() const { return _query_options.num_scanner_threads; } + TQueryType::type query_type() const { return _query_options.query_type; } int64_t timestamp_ms() const { return _timestamp_ms; } const std::string& timezone() const { return _timezone; } const cctz::time_zone& timezone_obj() const { return _timezone_obj; } @@ -121,8 +121,6 @@ class RuntimeState { const TUniqueId& query_id() const { return _query_id; } const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } ExecEnv* exec_env() { return _exec_env; } - const std::vector>& mem_trackers() { return _mem_trackers; } - std::shared_ptr fragment_mem_tracker() { return _fragment_mem_tracker; } std::shared_ptr query_mem_tracker() { return _query_mem_tracker; } std::shared_ptr instance_mem_tracker() { return _instance_mem_tracker; } ThreadResourceMgr::ResourcePool* resource_pool() { return _resource_pool; } @@ -158,22 +156,6 @@ class RuntimeState { return _process_status; }; - // MemPool* udf_pool() { - // return _udf_pool.get(); - // }; - - // Create and return a stream receiver for _fragment_instance_id - // from the data stream manager. The receiver is added to _data_stream_recvrs_pool. - DataStreamRecvr* create_recvr(const RowDescriptor& row_desc, PlanNodeId dest_node_id, - int num_senders, int buffer_size, RuntimeProfile* profile); - - // Sets the fragment memory limit and adds it to _mem_trackers - void set_fragment_mem_tracker(std::shared_ptr tracker) { - DCHECK(_fragment_mem_tracker == nullptr); - _fragment_mem_tracker = tracker; - _mem_trackers.push_back(tracker); - } - // Appends error to the _error_log if there is space bool log_error(const std::string& error); @@ -226,19 +208,11 @@ class RuntimeState { _process_status = status; } - // Sets query_status_ to MEM_LIMIT_EXCEEDED and logs all the registered trackers. - // Subsequent calls to this will be no-ops. Returns query_status_. - // If 'failed_allocation_size' is not 0, then it is the size of the allocation (in - // bytes) that would have exceeded the limit allocated for 'tracker'. - // This value and tracker are only used for error reporting. + // Sets _process_status to MEM_LIMIT_EXCEEDED. + // Subsequent calls to this will be no-ops. Returns _process_status. // If 'msg' is non-nullptr, it will be appended to query_status_ in addition to the // generic "Memory limit exceeded" error. - Status set_mem_limit_exceeded(MemTracker* tracker = nullptr, int64_t failed_allocation_size = 0, - const std::string* msg = nullptr); - - Status set_mem_limit_exceeded(const std::string& msg) { - return set_mem_limit_exceeded(nullptr, 0, &msg); - } + Status set_mem_limit_exceeded(const std::string& msg = "Memory limit exceeded"); // Returns a non-OK status if query execution should stop (e.g., the query was cancelled // or a mem limit was exceeded). Exec nodes should check this periodically so execution @@ -397,12 +371,6 @@ class RuntimeState { static const int DEFAULT_BATCH_SIZE = 2048; - // all mem limits that apply to this query - std::vector> _mem_trackers; - - // Fragment memory limit. 
Also contained in _mem_trackers
- std::shared_ptr<MemTracker> _fragment_mem_tracker;
-
 // MemTracker that is shared by all fragment instances running on this host.
 // The query mem tracker must be released after the _instance_mem_tracker.
 std::shared_ptr<MemTracker> _query_mem_tracker;
diff --git a/be/src/runtime/sorted_run_merger.cc b/be/src/runtime/sorted_run_merger.cc index 28baab462e8105..e4a15c56a1cd53 100644 --- a/be/src/runtime/sorted_run_merger.cc +++ b/be/src/runtime/sorted_run_merger.cc
@@ -178,6 +178,7 @@ class SortedRunMerger::ParallelBatchedRowSupplier : public SortedRunMerger::Batc
 std::condition_variable _batch_prepared_cv;
 void process_sorted_run_task() {
+ // TODO(zxy) Whether to attach mem tracker
 std::unique_lock<std::mutex> lock(_mutex);
 while (true) {
 _batch_prepared_cv.wait(lock, [this]() { return !_backup_ready.load(); });
@@ -307,11 +308,9 @@ Status SortedRunMerger::get_next(RowBatch* output_batch, bool* eos) {
 ChildSortedRunMerger::ChildSortedRunMerger(const TupleRowComparator& compare_less_than, RowDescriptor* row_desc, RuntimeProfile* profile,
- MemTracker* parent, uint32_t row_batch_size,
- bool deep_copy_input)
+ uint32_t row_batch_size, bool deep_copy_input)
 : SortedRunMerger(compare_less_than, row_desc, profile, deep_copy_input),
 _eos(false),
- _parent(parent),
 _row_batch_size(row_batch_size) {
 _get_next_timer = ADD_TIMER(profile, "ChildMergeGetNext");
 _get_next_batch_timer = ADD_TIMER(profile, "ChildMergeGetNextBatch");
@@ -323,7 +322,7 @@ Status ChildSortedRunMerger::get_batch(RowBatch** output_batch) {
 return Status::OK();
 }
- _current_row_batch.reset(new RowBatch(*_input_row_desc, _row_batch_size, _parent));
+ _current_row_batch.reset(new RowBatch(*_input_row_desc, _row_batch_size));
 bool eos = false;
 RETURN_IF_ERROR(get_next(_current_row_batch.get(), &eos));
diff --git a/be/src/runtime/sorted_run_merger.h b/be/src/runtime/sorted_run_merger.h index b73cdc1b0cbbe7..c448ac8b0cafed 100644 --- a/be/src/runtime/sorted_run_merger.h +++ b/be/src/runtime/sorted_run_merger.h
@@ -109,8 +109,7 @@ class SortedRunMerger {
 class ChildSortedRunMerger : public SortedRunMerger {
 public:
 ChildSortedRunMerger(const TupleRowComparator& compare_less_than, RowDescriptor* row_desc,
- RuntimeProfile* profile, MemTracker* _parent, uint32_t row_batch_size,
- bool deep_copy_input);
+ RuntimeProfile* profile, uint32_t row_batch_size, bool deep_copy_input);
 Status get_batch(RowBatch** output_batch) override;
@@ -121,8 +120,6 @@ class ChildSortedRunMerger : public SortedRunMerger {
 // The data in the merger is exhausted
 bool _eos = false;
- MemTracker* _parent;
-
 uint32_t _row_batch_size;
 };
diff --git a/be/src/runtime/spill_sorter.cc b/be/src/runtime/spill_sorter.cc index a461ebe7faff9a..fc9213501e258b 100644 --- a/be/src/runtime/spill_sorter.cc +++ b/be/src/runtime/spill_sorter.cc
@@ -638,10 +638,7 @@ Status SpillSorter::Run::prepare_read() {
 _pin_next_fixed_len_block = _pin_next_var_len_block = false;
 _num_tuples_returned = 0;
- // _buffered_batch.reset(new RowBatch(*_sorter->_output_row_desc,
- // _sorter->_state->batch_size(), _sorter->_mem_tracker));
- _buffered_batch.reset(new RowBatch(*_sorter->_output_row_desc, _sorter->_state->batch_size(),
- _sorter->_mem_tracker.get()));
+ _buffered_batch.reset(new RowBatch(*_sorter->_output_row_desc, _sorter->_state->batch_size()));
 // If the run is pinned, merge is not invoked, so _buffered_batch is not needed
 // and the individual blocks do not need to be pinned.
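// [Editorial aside, not part of the patch] A minimal sketch of the new call-site
// pattern shown in the hunks above: with the MemTracker parameter removed, a
// RowBatch is intended to be accounted against whatever tracker the constructing
// thread has switched to, so callers only scope the tracker (names follow this
// patch; the switch guard is still stubbed out in this revision):
//
//   SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(state->instance_mem_tracker());
//   RowBatch batch(*_output_row_desc, state->batch_size()); // tracker taken from thread_local_ctx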
@@ -1253,8 +1250,7 @@ Status SpillSorter::merge_intermediate_runs() {
 std::min(max_runs_per_intermediate_merge, _sorted_runs.size() - max_runs_per_intermediate_merge);
 RETURN_IF_ERROR(create_merger(num_runs_to_merge));
- RowBatch intermediate_merge_batch(*_output_row_desc, _state->batch_size(),
- _mem_tracker.get());
+ RowBatch intermediate_merge_batch(*_output_row_desc, _state->batch_size());
 // merged_run is the new sorted run that is produced by the intermediate merge.
 Run* merged_run = _obj_pool.add(new Run(this, _output_row_desc->tuple_descriptors()[0], false));
diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp index 75dc64fffdd78c..369430e89ea56b 100644 --- a/be/src/runtime/tablets_channel.cpp +++ b/be/src/runtime/tablets_channel.cpp
@@ -21,6 +21,7 @@
 #include "gutil/strings/substitute.h"
 #include "olap/delta_writer.h"
 #include "olap/memtable.h"
+#include "runtime/thread_context.h"
 #include "runtime/row_batch.h"
 #include "runtime/tuple_row.h"
 #include "util/doris_metrics.h"
@@ -32,10 +33,9 @@
 DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(tablet_writer_count, MetricUnit::NOUNIT);
 std::atomic TabletsChannel::_s_tablet_writer_count;
 TabletsChannel::TabletsChannel(const TabletsChannelKey& key,
- const std::shared_ptr<MemTracker>& mem_tracker,
 bool is_high_priority)
 : _key(key), _state(kInitialized), _closed_senders(64), _is_high_priority(is_high_priority) {
- _mem_tracker = MemTracker::CreateTracker(-1, "TabletsChannel", mem_tracker);
+ _mem_tracker = MemTracker::create_tracker(-1, "TabletsChannel:" + std::to_string(key.index_id));
 static std::once_flag once_flag;
 std::call_once(once_flag, [] {
 REGISTER_HOOK_METRIC(tablet_writer_count, [&]() { return _s_tablet_writer_count.load(); });
@@ -52,6 +52,7 @@ TabletsChannel::~TabletsChannel() {
 }
 Status TabletsChannel::open(const PTabletWriterOpenRequest& request) {
+ SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
 std::lock_guard<std::mutex> l(_lock);
 if (_state == kOpened) {
 // Normal case, already open by other sender
@@ -78,6 +79,7 @@ Status TabletsChannel::open(const PTabletWriterOpenRequest& request) {
 Status TabletsChannel::add_batch(const PTabletWriterAddBatchRequest& request, PTabletWriterAddBatchResult* response) {
+ SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
 DCHECK(request.tablet_ids_size() == request.row_batch().num_rows());
 int64_t cur_seq;
 {
@@ -101,7 +103,7 @@ Status TabletsChannel::add_batch(const PTabletWriterAddBatchRequest& request,
 }
 }
- RowBatch row_batch(*_row_desc, request.row_batch(), _mem_tracker.get());
+ RowBatch row_batch(*_row_desc, request.row_batch());
 std::unordered_map<int64_t, std::vector<int> /* row index */> tablet_to_rowidxs;
 for (int i = 0; i < request.tablet_ids_size(); ++i) {
 int64_t tablet_id = request.tablet_ids(i);
@@ -150,6 +152,7 @@ Status TabletsChannel::add_batch(const PTabletWriterAddBatchRequest& request,
 Status TabletsChannel::close(int sender_id, int64_t backend_id, bool* finished, const google::protobuf::RepeatedField<int64_t>& partition_ids, google::protobuf::RepeatedPtrField<PTabletInfo>* tablet_vec) {
+ SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
 std::lock_guard<std::mutex> l(_lock);
 if (_state == kFinished) {
 return _close_status;
@@ -199,13 +202,12 @@ Status TabletsChannel::close(int sender_id, int64_t backend_id, bool* finished,
 // tablet_vec will only contain successful tablets, and then let FE judge it.
writer->close_wait(tablet_vec, (_broken_tablets.find(writer->tablet_id()) != _broken_tablets.end()));
 }
- // TODO(gaodayue) clear and destruct all delta writers to make sure all memory are freed
- // DCHECK_EQ(_mem_tracker->consumption(), 0);
 }
 return Status::OK();
 }
 Status TabletsChannel::reduce_mem_usage(int64_t mem_limit) {
+ SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
 std::lock_guard<std::mutex> l(_lock);
 if (_state == kFinished) {
 // TabletsChannel is closed without LoadChannel's lock,
@@ -261,6 +263,7 @@ Status TabletsChannel::reduce_mem_usage(int64_t mem_limit) {
 }
 Status TabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& request) {
+ SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
 std::vector<SlotDescriptor*>* index_slots = nullptr;
 int32_t schema_hash = 0;
 for (auto& index : _schema->indexes()) {
@@ -289,7 +292,7 @@
 wrequest.is_high_priority = _is_high_priority;
 DeltaWriter* writer = nullptr;
- auto st = DeltaWriter::open(&wrequest, _mem_tracker, &writer);
+ auto st = DeltaWriter::open(&wrequest, &writer);
 if (st != OLAP_SUCCESS) {
 std::stringstream ss;
 ss << "open delta writer failed, tablet_id=" << tablet.tablet_id()
@@ -306,6 +309,7 @@
 }
 Status TabletsChannel::cancel() {
+ SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
 std::lock_guard<std::mutex> l(_lock);
 if (_state == kFinished) {
 return _close_status;
diff --git a/be/src/runtime/tablets_channel.h b/be/src/runtime/tablets_channel.h index e99ac6264b396b..226b2b76db05f5 100644 --- a/be/src/runtime/tablets_channel.h +++ b/be/src/runtime/tablets_channel.h
@@ -54,7 +54,7 @@ class OlapTableSchemaParam;
 // Write channel for a particular (load, index).
 class TabletsChannel {
 public:
- TabletsChannel(const TabletsChannelKey& key, const std::shared_ptr<MemTracker>& mem_tracker, bool is_high_priority);
+ TabletsChannel(const TabletsChannelKey& key, bool is_high_priority);
 ~TabletsChannel();
diff --git a/be/src/runtime/tcmalloc_hook.h b/be/src/runtime/tcmalloc_hook.h new file mode 100644 index 00000000000000..ac24208e748ee6 --- /dev/null +++ b/be/src/runtime/tcmalloc_hook.h
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gperftools/malloc_hook.h>
+#include <gperftools/nallocx.h>
+#include <gperftools/tcmalloc.h>
+
+#include "runtime/thread_context.h"
+
+// Notice: code that runs inside the New/Delete hooks must be written carefully
+// and kept as simple as possible, otherwise it may cause weird errors. E.g.:
+// 1. The first New hook call of the process may happen before some process
+// variables are initialized.
+// 2. Allocating memory inside a hook re-enters the hook, causing infinite
+// recursion.
+// 3.
The TCMalloc hook may be triggered while a MemTracker shared_ptr is being
+// constructed or destructed; using the object pointed to by that shared_ptr
+// inside the hook may cause a crash.
+// 4. Modifying extra thread-local variables in the ThreadContext constructor and
+// destructor to control consume behavior can lead to unexpected results,
+// e.g. guards like: if (LIKELY(doris::thread_mem_tracker_mgr_init)) {
+void new_hook(const void* ptr, size_t size) {
+ doris::thread_local_ctx.get()->consume_mem(tc_nallocx(size, 0));
+}
+
+void delete_hook(const void* ptr) {
+ doris::thread_local_ctx.get()->release_mem(tc_malloc_size(const_cast<void*>(ptr)));
+}
+
+void init_hook() {
+ MallocHook::AddNewHook(&new_hook);
+ MallocHook::AddDeleteHook(&delete_hook);
+}
+
+void destroy_hook() {
+ MallocHook::RemoveNewHook(&new_hook);
+ MallocHook::RemoveDeleteHook(&delete_hook);
+}
diff --git a/be/src/runtime/thread_context.cpp b/be/src/runtime/thread_context.cpp new file mode 100644 index 00000000000000..8aec6898c76ae1 --- /dev/null +++ b/be/src/runtime/thread_context.cpp
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/thread_context.h"
+
+namespace doris {
+
+DEFINE_STATIC_THREAD_LOCAL(ThreadContext, ThreadContextPtr, thread_local_ctx);
+
+ThreadContextPtr::ThreadContextPtr() {
+ INIT_STATIC_THREAD_LOCAL(ThreadContext, thread_local_ctx);
+}
+
+ThreadContext* ThreadContextPtr::get() {
+ return thread_local_ctx;
+}
+
+}
diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h new file mode 100644 index 00000000000000..f4550b95e99750 --- /dev/null +++ b/be/src/runtime/thread_context.h
@@ -0,0 +1,316 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <thread>
+
+#include "runtime/thread_mem_tracker_mgr.h"
+#include "runtime/threadlocal.h"
+
+// Attach to task when thread starts
+#define SCOPED_ATTACH_TASK_THREAD_2ARG(type, mem_tracker) \
+ auto VARNAME_LINENUM(attach_task_thread) = AttachTaskThread(type, mem_tracker)
+#define SCOPED_ATTACH_TASK_THREAD_4ARG(query_type, task_id, fragment_instance_id, mem_tracker) \
+ auto VARNAME_LINENUM(attach_task_thread) = \
+ AttachTaskThread(query_type, task_id, fragment_instance_id, mem_tracker)
+#define SCOPED_ATTACH_TASK_THREAD_4ARGP(query_type, task_id, fragment_instance_id, mem_tracker) \
+ auto VARNAME_LINENUM(attach_task_thread) = \
+ AttachTaskThreadP(query_type, task_id, fragment_instance_id, mem_tracker)
+// Switch the MemTracker during thread execution.
+// Must be switched inside SCOPED_ATTACH_TASK_THREAD, otherwise the cached mem tracker may not be released.
+// Must-see Notes: do not use SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER and SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER too frequently,
+// because the construction/destruction order of variables, and the order in which memory is allocated and freed, may differ
+// from the instruction execution order (to be investigated further); this can cause memory to be counted against an
+// unexpected tracker or at an unexpected location.
#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker) \
+ auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker)
+#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARGP(mem_tracker) \
+ auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTrackerP(mem_tracker)
+#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker, action_type) \
+ do { \
+ auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker); \
+ auto VARNAME_LINENUM(switch_tracker_cb) = SwitchThreadMemTrackerCallBack(action_type); \
+ } while (false)
+#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_3ARG(mem_tracker, action_type, cancel_work) \
+ do { \
+ auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker); \
+ auto VARNAME_LINENUM(switch_tracker_cb) = \
+ SwitchThreadMemTrackerCallBack(action_type, cancel_work); \
+ } while (false)
+#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_4ARG(mem_tracker, action_type, cancel_work, \
+ err_call_back_func) \
+ do { \
+ auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker); \
+ auto VARNAME_LINENUM(switch_tracker_cb) = \
+ SwitchThreadMemTrackerCallBack(action_type, cancel_work, err_call_back_func); \
+ } while (false)
+#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB(action_type) \
+ auto VARNAME_LINENUM(switch_tracker_cb) = SwitchThreadMemTrackerCallBack(action_type)
+#define SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER() \
+ auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker(true)
+#define GLOBAL_STOP_THREAD_LOCAL_MEM_TRACKER() \
+ auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker(false)
+#define CHECK_MEM_LIMIT(size) thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->check_limit(size)
+
+namespace doris {
+
+class TUniqueId;
+
+// The thread context saves some info about a working thread.
+// 3 required pieces of info:
+// 1. thread_id: Current thread id, auto generated.
+// 2. type: An enum value indicating which type of task the current thread is running.
+// For example: QUERY, LOAD, COMPACTION, ...
+// 3. task id: A unique id identifying this task; maybe a query id, load job id, etc.
+//
+// There may be other optional info to be added later.
+class ThreadContext {
+public:
+ enum TaskType {
+ UNKNOWN = 0,
+ QUERY = 1,
+ LOAD = 2,
+ COMPACTION = 3
+ // to be added ...
+ }; + +public: + ThreadContext() : _thread_id(std::this_thread::get_id()), _type(TaskType::UNKNOWN) { + _thread_mem_tracker_mgr.reset(new ThreadMemTrackerMgr()); + } + + void attach(const TaskType& type, const std::string& task_id, + const TUniqueId& fragment_instance_id, std::shared_ptr mem_tracker) { + DCHECK(_type == TaskType::UNKNOWN && _task_id == ""); + _type = type; + _task_id = task_id; + _fragment_instance_id = fragment_instance_id; + _thread_mem_tracker_mgr->attach_task(get_type(), task_id, fragment_instance_id, + mem_tracker); + } + + void detach() { + _type = TaskType::UNKNOWN; + _task_id = ""; + _fragment_instance_id = TUniqueId(); + _thread_mem_tracker_mgr->detach_task(); + } + + const std::string get_type() const; + const std::string& task_id() const { return _task_id; } + const std::thread::id& thread_id() const { return _thread_id; } + const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } + + void consume_mem(int64_t size) { + if (thread_mem_tracker_mgr_init == true) { + _thread_mem_tracker_mgr->cache_consume(size); + } + } + + void release_mem(int64_t size) { + if (thread_mem_tracker_mgr_init == true) { + _thread_mem_tracker_mgr->cache_consume(-size); + } + } + + // After _thread_mem_tracker_mgr is initialized, the current thread TCMalloc Hook starts to + // consume/release mem_tracker. + // Note that the use of shared_ptr will cause a crash. The guess is that there is an + // intermediate state during the copy construction of shared_ptr. Shared_ptr is not equal + // to nullptr, but the object it points to is not initialized. At this time, when the memory + // is released somewhere, the TCMalloc hook is triggered to cause the crash. + std::unique_ptr _thread_mem_tracker_mgr; + // ThreadMemTrackerMgr* _thread_mem_tracker_mgr; + +private: + std::thread::id _thread_id; + TaskType _type; + std::string _task_id; + TUniqueId _fragment_instance_id; +}; + +// inline thread_local ThreadContext thread_local_ctx; +// inline BLOCK_STATIC_THREAD_LOCAL2(ThreadContext, thread_local_ctx); + +// static ThreadContext* load_tls() { +// thread_mem_tracker_mgr_init = false; +// BLOCK_STATIC_THREAD_LOCAL(ThreadContext, ctx2); +// thread_mem_tracker_mgr_init = true; +// return ctx2; +// } + +class ThreadContextPtr { +public: + ThreadContextPtr(); + + ThreadContext* get(); +private: + DECLARE_STATIC_THREAD_LOCAL(ThreadContext, thread_local_ctx); +}; + +inline thread_local ThreadContextPtr thread_local_ctx; + +inline const std::string task_type_string(ThreadContext::TaskType type) { + switch (type) { + case ThreadContext::TaskType::QUERY: + return "QUERY"; + case ThreadContext::TaskType::LOAD: + return "LOAD"; + case ThreadContext::TaskType::COMPACTION: + return "COMPACTION"; + default: + return "UNKNOWN"; + } +} + +inline const std::string ThreadContext::get_type() const { + return task_type_string(_type); +} + +class AttachTaskThread { +public: + explicit AttachTaskThread(const ThreadContext::TaskType& type, + const std::shared_ptr mem_tracker) { + DCHECK(mem_tracker != nullptr); + init(type, "", TUniqueId(), mem_tracker); + } + + explicit AttachTaskThread(const TQueryType::type& query_type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + const std::shared_ptr& mem_tracker) { + DCHECK(task_id != "" && fragment_instance_id != TUniqueId() && mem_tracker != nullptr); + if (query_type == TQueryType::SELECT) { + init(ThreadContext::TaskType::QUERY, task_id, fragment_instance_id, mem_tracker); + } else if (query_type == TQueryType::LOAD) { + 
init(ThreadContext::TaskType::LOAD, task_id, fragment_instance_id, mem_tracker); + } + } + + void init(const ThreadContext::TaskType& type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + const std::shared_ptr& mem_tracker) { + // thread_local_ctx.get()->attach(type, task_id, fragment_instance_id, mem_tracker); + } + + // ~AttachTaskThread() { thread_local_ctx.get()->detach(); } +}; + +class AttachTaskThreadP { +public: + explicit AttachTaskThreadP(const ThreadContext::TaskType& type, + const std::shared_ptr mem_tracker) { + DCHECK(mem_tracker != nullptr); + init(type, "", TUniqueId(), mem_tracker); + } + + explicit AttachTaskThreadP(const TQueryType::type& query_type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + const std::shared_ptr& mem_tracker) { + DCHECK(task_id != "" && fragment_instance_id != TUniqueId() && mem_tracker != nullptr); + if (query_type == TQueryType::SELECT) { + init(ThreadContext::TaskType::QUERY, task_id, fragment_instance_id, mem_tracker); + } else if (query_type == TQueryType::LOAD) { + init(ThreadContext::TaskType::LOAD, task_id, fragment_instance_id, mem_tracker); + } + } + + void init(const ThreadContext::TaskType& type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + const std::shared_ptr& mem_tracker) { + // thread_local_ctx.get()->attach(type, task_id, fragment_instance_id, mem_tracker); + } + + // ~AttachTaskThreadP() { thread_local_ctx.get()->detach(); } +}; + +class SwitchThreadMemTracker { +public: + explicit SwitchThreadMemTracker(const std::shared_ptr& mem_tracker) { + DCHECK(mem_tracker != nullptr); + // _old_tracker_id = thread_local_ctx.get()->_thread_mem_tracker_mgr->update_tracker(mem_tracker); + } + + ~SwitchThreadMemTracker() { + // thread_local_ctx.get()->_thread_mem_tracker_mgr->set_tracker_id(_old_tracker_id); + } + +private: + std::string _old_tracker_id; +}; + +class SwitchThreadMemTrackerP { +public: + explicit SwitchThreadMemTrackerP(const std::shared_ptr& mem_tracker) { + DCHECK(mem_tracker != nullptr); + // _old_tracker_id = thread_local_ctx.get()->_thread_mem_tracker_mgr->update_trackerP(mem_tracker); + } + + ~SwitchThreadMemTrackerP() { + // thread_local_ctx.get()->_thread_mem_tracker_mgr->set_tracker_idP(_old_tracker_id); + } + +private: + std::string _old_tracker_id; +}; + +class SwitchThreadMemTrackerCallBack { +public: + explicit SwitchThreadMemTrackerCallBack(const std::string& action_type) { + DCHECK(action_type != std::string()); + init(action_type); + } + + explicit SwitchThreadMemTrackerCallBack(const std::string& action_type, bool cancel_work) { + DCHECK(action_type != std::string()); + init(action_type, cancel_work); + } + + explicit SwitchThreadMemTrackerCallBack(const std::string& action_type, bool cancel_work, + ERRCALLBACK err_call_back_func) { + DCHECK(action_type != std::string() && err_call_back_func != nullptr); + init(action_type, cancel_work, err_call_back_func); + } + + void init(const std::string& action_type = std::string(), bool cancel_work = true, + ERRCALLBACK err_call_back_func = nullptr) { + _old_tracker_call_back = thread_local_ctx.get()->_thread_mem_tracker_mgr->update_consume_err_call_back( + action_type, cancel_work, err_call_back_func); + } + + ~SwitchThreadMemTrackerCallBack() { + thread_local_ctx.get()->_thread_mem_tracker_mgr->update_consume_err_call_back(_old_tracker_call_back); + } + +private: + ConsumeErrCallBackInfo _old_tracker_call_back; +}; + +class StopThreadMemTracker { +public: + explicit StopThreadMemTracker(const bool 
scope = true) : _scope(scope) { + thread_mem_tracker_mgr_init = false; + } + + ~StopThreadMemTracker() { + if (_scope == true) thread_mem_tracker_mgr_init = true; + } + +private: + bool _scope; +}; + +} // namespace doris diff --git a/be/src/runtime/thread_mem_tracker_mgr.cpp b/be/src/runtime/thread_mem_tracker_mgr.cpp new file mode 100644 index 00000000000000..39915a3c0211ad --- /dev/null +++ b/be/src/runtime/thread_mem_tracker_mgr.cpp @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "runtime/thread_mem_tracker_mgr.h" + +#include "runtime/mem_tracker_task_pool.h" +#include "service/backend_options.h" + +namespace doris { + +void ThreadMemTrackerMgr::attach_task(const std::string& action_type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + const std::shared_ptr& mem_tracker) { + _task_id = task_id; + _fragment_instance_id = fragment_instance_id; + _consume_err_call_back.update(action_type, true, nullptr); + if (mem_tracker == nullptr) { +#ifdef BE_TEST + if (ExecEnv::GetInstance()->task_pool_mem_tracker_registry() == nullptr) { + return; + } +#endif + _temp_task_mem_tracker = ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_task_mem_tracker(task_id); + update_tracker(_temp_task_mem_tracker); + } else { + update_tracker(mem_tracker); + } +} + +void ThreadMemTrackerMgr::detach_task() { + _task_id = ""; + _fragment_instance_id = TUniqueId(); + _consume_err_call_back.init(); + clear_untracked_mems(); + _tracker_id = "process"; + _untracked_mems.clear(); + _untracked_mems["process"] = 0; + _mem_trackers.clear(); + _mem_trackers["process"] = MemTracker::get_process_tracker(); +} + +void ThreadMemTrackerMgr::exceeded_cancel_task(const std::string& cancel_details) { + _temp_task_mem_tracker = + ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_task_mem_tracker( + _task_id); + if (_temp_task_mem_tracker != nullptr && _temp_task_mem_tracker->limit_exceeded() && + _fragment_instance_id != TUniqueId() && ExecEnv::GetInstance()->initialized() && + ExecEnv::GetInstance()->fragment_mgr()->is_canceling(_fragment_instance_id).ok()) { + ExecEnv::GetInstance()->fragment_mgr()->cancel( + _fragment_instance_id, PPlanFragmentCancelReason::MEMORY_LIMIT_EXCEED, + cancel_details); + _fragment_instance_id = TUniqueId(); // Make sure it will only be canceled once + } +} + +void ThreadMemTrackerMgr::exceeded(int64_t mem_usage, Status st) { + auto rst = _mem_trackers[_tracker_id]->mem_limit_exceeded( + nullptr, "In TCMalloc Hook, " + _consume_err_call_back.action_type, mem_usage, st); + if (_consume_err_call_back.call_back_func != nullptr) { + _consume_err_call_back.call_back_func(); + } + if (_task_id != "" && _consume_err_call_back.cancel_task == true) { + 
exceeded_cancel_task(rst.to_string());
 }
}

} // namespace doris
diff --git a/be/src/runtime/thread_mem_tracker_mgr.h b/be/src/runtime/thread_mem_tracker_mgr.h new file mode 100644 index 00000000000000..9869c1aeabeac7 --- /dev/null +++ b/be/src/runtime/thread_mem_tracker_mgr.h
@@ -0,0 +1,322 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+
+#include "runtime/exec_env.h"
+#include "runtime/fragment_mgr.h"
+#include "runtime/mem_tracker.h"
+
+namespace doris {
+
+typedef void (*ERRCALLBACK)();
+
+struct ConsumeErrCallBackInfo {
+ std::string action_type;
+ bool cancel_task; // Whether to cancel the task when the current tracker exceeds the limit
+ ERRCALLBACK call_back_func;
+
+ ConsumeErrCallBackInfo() {
+ init();
+ }
+
+ ConsumeErrCallBackInfo(std::string action_type, bool cancel_task, ERRCALLBACK call_back_func)
+ : action_type(action_type), cancel_task(cancel_task), call_back_func(call_back_func) {}
+
+ void update(std::string new_action_type, bool new_cancel_task, ERRCALLBACK new_call_back_func) {
+ action_type = new_action_type;
+ cancel_task = new_cancel_task;
+ call_back_func = new_call_back_func;
+ }
+
+ void init() {
+ action_type = "";
+ cancel_task = false;
+ call_back_func = nullptr;
+ }
+};
+
+// If there is a memory new/delete operation in the consume method, it may enter infinite recursion.
+// Note: After the tracker is stopped, the memory allocated in the consume method should be released in time,
+// otherwise the MemTracker statistics will be inaccurate.
+// In some cases, we want to turn off the thread's automatic memory statistics and call consume manually.
+// In addition, when ~RootTracker runs, a TCMalloc delete hook release against the RootTracker would crash.
+inline thread_local bool thread_mem_tracker_mgr_init = false;
+
+// TCMalloc new/delete Hook is counted in the memory_tracker of the current thread.
+//
+// In the original design, the MemTracker consume method is called before the memory is allocated.
+// If the consume succeeds, the memory is actually allocated, otherwise an exception is thrown.
+// But the statistics of memory through TCMalloc new/delete Hook are taken after the memory is actually allocated,
+// which is different from the previous behavior. Therefore, when allocating some large block of memory,
+// manually call consume after stop_mem_tracker, and then start_mem_tracker.
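// [Editorial aside, not part of the patch] The "stop, then consume manually"
// pattern described above might look like this at a call site; a sketch under
// the assumption that the thread has a tracker switched in (it mirrors the
// SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER usage in the RowBatch constructors of
// this same patch):
//
//   {
//       SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER();  // hooks stop counting for this scope
//       char* buf = new char[large_size];        // allocation invisible to the hooks
//       thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->consume(large_size);
//   }                                            // hook counting resumes on scope exit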
+class ThreadMemTrackerMgr { +public: + ThreadMemTrackerMgr() { + _mem_trackers["process"] = MemTracker::get_process_tracker(); + _untracked_mems["process"] = 0; + _tracker_id = "process"; + thread_mem_tracker_mgr_init = true; + } + ~ThreadMemTrackerMgr() { + clear_untracked_mems(); + thread_mem_tracker_mgr_init = false; + } + + void clear_untracked_mems() { + for(auto untracked_mem : _untracked_mems) { + // auto tracker = _mem_trackers[untracked_mem.first].lock(); + if (untracked_mem.second != 0) { + // if (_mem_trackers[untracked_mem.first]) { + _mem_trackers[untracked_mem.first]->consume(untracked_mem.second); + // _mem_trackers[untracked_mem.first]->consume(untracked_mem.second); + // } else { + // DCHECK(_tracker_id == "process"); + // _root_mem_tracker->consume(untracked_mem.second); + // } + // if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + // ExecEnv::GetInstance()->new_process_mem_tracker()->consume(untracked_mem.second); + // } + } + } + _mem_trackers[_tracker_id]->consume(_untracked_mem); + _untracked_mem = 0; + // if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + // ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem); + // } + if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem2); + _untracked_mem2 = 0; + } + } + + // After attach, the current thread TCMalloc Hook starts to consume/release task mem_tracker + void attach_task(const std::string& action_type, const std::string& task_id, + const TUniqueId& fragment_instance_id, + const std::shared_ptr& mem_tracker); + + void detach_task(); + + // Must be fast enough!!! + // Thread update_tracker may be called very frequently, adding a memory copy will be slow. + std::string update_tracker(const std::shared_ptr& mem_tracker); + std::string update_trackerP(const std::shared_ptr& mem_tracker); + + void set_tracker_id(const std::string& tracker_id) { + // DCHECK(_untracked_mem == 0); + if (tracker_id != _tracker_id) { + // _untracked_mems[_tracker_id] += _untracked_mem; + _mem_trackers[_tracker_id]->consume_cache(_untracked_mem); + // if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + // ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem2); + // } + _untracked_mem = 0; + _tracker_id = tracker_id; + } + // std::swap(_untracked_mems[_tracker_id], _untracked_mem); + + // _untracked_mem = _untracked_mems[_tracker_id]; + // _untracked_mems[_tracker_id] = 0; + } + + void set_tracker_idP(const std::string& tracker_id) { + if (tracker_id != _tracker_id) { + _mem_trackers[_tracker_id]->consume_cache(_untracked_mem); + _untracked_mem = 0; + _tracker_id = tracker_id; + } + } + + inline ConsumeErrCallBackInfo update_consume_err_call_back( + const std::string& action_type, bool cancel_task, ERRCALLBACK call_back_func); + + inline void update_consume_err_call_back(ConsumeErrCallBackInfo& consume_err_call_back) { + _consume_err_call_back = consume_err_call_back; + } + + // Note that, If call the memory allocation operation in TCMalloc new/delete Hook, + // such as calling LOG/iostream/sstream/stringstream/etc. 
related methods,
+ extra guards must be added to avoid entering infinite recursion, otherwise it may crash or hang.
+ void cache_consume(int64_t size);
+
+ void noncache_consume();
+
+ // Frequent weak_ptr.lock() is expensive
+ std::shared_ptr<MemTracker> mem_tracker() {
+ // if (_shared_mem_tracker == nullptr || _shared_mem_tracker->id() != _tracker_id) {
+ // _shared_mem_tracker = _mem_trackers[_tracker_id];
+ // }
+ // if (_mem_trackers[_tracker_id]) {
+ return _mem_trackers[_tracker_id];
+ // } else {
+ // DCHECK(_tracker_id == "process");
+ // return MemTracker::get_root_tracker();
+ // }
+ }
+
+private:
+ // If tryConsume fails due to task mem tracker exceeding the limit, the task must be canceled
+ void exceeded_cancel_task(const std::string& cancel_details);
+
+ void exceeded(int64_t mem_usage, Status st);
+
+private:
+ // Avoid the cost of shared_ptr use-count updates
+ std::unordered_map<std::string, std::shared_ptr<MemTracker>> _mem_trackers;
+ // label + timestamp
+ std::string _tracker_id;
+ // MemTracker* _process_mem_tracker;
+
+ // Consume size smaller than mem_tracker_consume_min_size_bytes will continue to accumulate
+ // to avoid frequent calls to consume/release of MemTracker.
+ std::unordered_map<std::string, int64_t> _untracked_mems;
+ // Cache untracked mem, only update to _untracked_mems when switching mem tracker.
+ // Frequent calls to unordered_map _untracked_mems[] in cache_consume will degrade performance.
+ int64_t _untracked_mem = 0;
+ int64_t _untracked_mem2 = 0;
+
+ ConsumeErrCallBackInfo _consume_err_call_back;
+
+ // Avoid memory allocation in these functions, which could fall into an infinite loop
+ std::string _temp_tracker_id;
+ ConsumeErrCallBackInfo _temp_consume_err_call_back;
+ std::shared_ptr<MemTracker> _temp_task_mem_tracker;
+
+ std::string _task_id;
+ TUniqueId _fragment_instance_id;
+};
+
+inline std::string ThreadMemTrackerMgr::update_tracker(const std::shared_ptr<MemTracker>& mem_tracker) {
+ DCHECK(mem_tracker != nullptr);
+ DCHECK(_mem_trackers[_tracker_id]);
+ _temp_tracker_id = mem_tracker->id();
+ if (_temp_tracker_id == _tracker_id) {
+ return _tracker_id;
+ }
+ if (_mem_trackers.find(_temp_tracker_id) == _mem_trackers.end()) {
+ _mem_trackers[_temp_tracker_id] = mem_tracker;
+ _untracked_mems[_temp_tracker_id] = 0;
+ }
+ // _untracked_mems[_tracker_id] += _untracked_mem;
+ _mem_trackers[_tracker_id]->consume(_untracked_mem);
+ // if (ExecEnv::GetInstance()->new_process_mem_tracker()) {
+ // ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem2);
+ // }
+ _untracked_mem = 0;
+ std::swap(_tracker_id, _temp_tracker_id);
+ // if (_mem_trackers.find(_temp_tracker_id) == _mem_trackers.end()) {
+ // _mem_trackers[_temp_tracker_id] = mem_tracker;
+ // _untracked_mems[_tracker_id] += _untracked_mem;
+ // _untracked_mem = 0;
+ // // std::swap(_untracked_mems[_tracker_id], _untracked_mem);
+ // // DCHECK(_untracked_mem == 0);
+ // std::swap(_tracker_id, _temp_tracker_id);
+ // _untracked_mems[_tracker_id] = 0;
+ // } else {
+ // // std::swap(_untracked_mems[_tracker_id], _untracked_mem);
+ // // DCHECK(_untracked_mem == 0);
+ // _untracked_mems[_tracker_id] += _untracked_mem;
+ // _untracked_mem = 0;
+ // std::swap(_tracker_id, _temp_tracker_id);
+ // // std::swap(_untracked_mems[_tracker_id], _untracked_mem);
+ // }
+ DCHECK(_mem_trackers[_temp_tracker_id]);
+ return _temp_tracker_id; // old tracker_id
+ // return _tracker_id;
+}
+
+inline std::string ThreadMemTrackerMgr::update_trackerP(const std::shared_ptr<MemTracker>& mem_tracker) {
+ _temp_tracker_id = mem_tracker->id();
+ if (_temp_tracker_id == _tracker_id) {
+ return _tracker_id;
+ }
+ if
(_mem_trackers.find(_temp_tracker_id) == _mem_trackers.end()) { + _mem_trackers[_temp_tracker_id] = mem_tracker; + _untracked_mems[_temp_tracker_id] = 0; + } + _mem_trackers[_tracker_id]->consume(_untracked_mem); + _untracked_mem = 0; + std::swap(_tracker_id, _temp_tracker_id); + DCHECK(_mem_trackers[_temp_tracker_id]); + return _temp_tracker_id; // old tracker_id + // return _tracker_id; +} + +inline ConsumeErrCallBackInfo ThreadMemTrackerMgr::update_consume_err_call_back( + const std::string& action_type, bool cancel_task, ERRCALLBACK call_back_func) { + _temp_consume_err_call_back = _consume_err_call_back; + _consume_err_call_back.update(action_type, cancel_task, call_back_func); + return _temp_consume_err_call_back; +} + +inline void ThreadMemTrackerMgr::cache_consume(int64_t size) { + // _untracked_mems[_tracker_id] += size; + _untracked_mem += size; + _untracked_mem2 += size; + // When some threads `0 < _untracked_mem < config::mem_tracker_consume_min_size_bytes` + // and some threads `_untracked_mem <= -config::mem_tracker_consume_min_size_bytes` trigger consumption(), + // it will cause tracker->consumption to be temporarily less than 0. + if (_untracked_mem >= config::mem_tracker_consume_min_size_bytes || + _untracked_mem <= -config::mem_tracker_consume_min_size_bytes) { + // DCHECK(_mem_trackers.find(_tracker_id) != _mem_trackers.end()); + thread_mem_tracker_mgr_init = false; + if (_untracked_mems[_tracker_id] != 0) { + _untracked_mem += _untracked_mems[_tracker_id]; + _untracked_mems[_tracker_id] = 0; + } + noncache_consume(); + // _untracked_mem = 0; + // _untracked_mem2 = 0; + thread_mem_tracker_mgr_init = true; + } + + + if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + if (_untracked_mem2 >= config::mem_tracker_consume_min_size_bytes || + _untracked_mem2 <= -config::mem_tracker_consume_min_size_bytes) { + ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem2); + _untracked_mem2 = 0; + } + } +} + +inline void ThreadMemTrackerMgr::noncache_consume() { + // if (ExecEnv::GetInstance()->new_process_mem_tracker()) { + // ExecEnv::GetInstance()->new_process_mem_tracker()->consume(_untracked_mem2); + // } + // Ensure thread safety + // auto tracker = _mem_trackers[_tracker_id].lock(); + if (_mem_trackers[_tracker_id]) { + Status st = _mem_trackers[_tracker_id]->try_consume(_untracked_mem); + if (!st) { + // The memory has been allocated, so when TryConsume fails, need to continue to complete + // the consume to ensure the accuracy of the statistics. + _mem_trackers[_tracker_id]->consume(_untracked_mem); + exceeded(_untracked_mem, st); + } + _untracked_mem = 0; + } + // else { + // DCHECK(_tracker_id == "process"); + // _mem_trackers["process"] = ExecEnv::GetInstance()->process_mem_tracker(); + // _root_mem_tracker->consume(_untracked_mems[_tracker_id]); + // } +} + +} // namespace doris diff --git a/be/src/runtime/threadlocal.cc b/be/src/runtime/threadlocal.cc new file mode 100644 index 00000000000000..ac2bf2e62a9094 --- /dev/null +++ b/be/src/runtime/threadlocal.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
diff --git a/be/src/runtime/threadlocal.cc b/be/src/runtime/threadlocal.cc
new file mode 100644
index 00000000000000..ac2bf2e62a9094
--- /dev/null
+++ b/be/src/runtime/threadlocal.cc
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "runtime/threadlocal.h"
+
+#include <pthread.h>
+
+#include <memory>
+#include <ostream>
+#include <string>
+
+#include "common/logging.h"
+#include "gutil/once.h"
+#include "util/errno.h"
+
+namespace doris {
+
+// One key used by the entire process to attach destructors on thread exit.
+static pthread_key_t destructors_key;
+
+// The above key must only be initialized once per process.
+static GoogleOnceType once = GOOGLE_ONCE_INIT;
+
+namespace {
+
+// List of destructors for all thread locals instantiated on a given thread.
+struct PerThreadDestructorList {
+    void (*destructor)(void*);
+    void* arg;
+    PerThreadDestructorList* next;
+};
+
+} // anonymous namespace
+
+// Call all the destructors associated with all THREAD_LOCAL instances in this
+// thread.
+static void invoke_destructors(void* t) {
+    PerThreadDestructorList* d = reinterpret_cast<PerThreadDestructorList*>(t);
+    while (d != nullptr) {
+        d->destructor(d->arg);
+        PerThreadDestructorList* next = d->next;
+        delete d;
+        d = next;
+    }
+}
+
+// This key must be initialized only once.
+static void create_key() {
+    int ret = pthread_key_create(&destructors_key, &invoke_destructors);
+    // Linux supports up to 1024 keys; we use only one for all thread locals.
+    CHECK_EQ(0, ret) << "pthread_key_create() failed, cannot add destructor to thread: "
+                     << "error " << ret << ": " << errno_to_string(ret);
+}
+
+// Adds a destructor to the list.
+void add_destructor(void (*destructor)(void*), void* arg) {
+    GoogleOnceInit(&once, &create_key);
+
+    // Returns NULL if nothing is set yet.
+    std::unique_ptr<PerThreadDestructorList> p(new PerThreadDestructorList());
+    p->destructor = destructor;
+    p->arg = arg;
+    p->next = reinterpret_cast<PerThreadDestructorList*>(pthread_getspecific(destructors_key));
+    int ret = pthread_setspecific(destructors_key, p.release());
+    // The only time this check should fail is if we are out of memory, or if
+    // somehow key creation failed, which should be caught by the above CHECK.
+    CHECK_EQ(0, ret) << "pthread_setspecific() failed, cannot update destructor list: "
+                     << "error " << ret << ": " << errno_to_string(ret);
+}
+
+} // namespace doris
diff --git a/be/src/runtime/threadlocal.h b/be/src/runtime/threadlocal.h
new file mode 100644
index 00000000000000..53956589595ae4
--- /dev/null
+++ b/be/src/runtime/threadlocal.h
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#ifndef THREADLOCAL_H_
+#define THREADLOCAL_H_
+
+// Block-scoped static thread local implementation.
+//
+// Usage is similar to a C++11 thread_local. The BLOCK_STATIC_THREAD_LOCAL macro
+// defines a thread-local pointer to the specified type, which is lazily
+// instantiated by any thread entering the block for the first time. The
+// constructor for the type T is invoked at macro execution time, as expected,
+// and its destructor is invoked when the corresponding thread's Runnable
+// returns, or when the thread exits.
+//
+// Inspired by Poco's ThreadLocal, Andrew Tomazos, and the C++11 thread_local API.
+//
+// Example usage:
+//
+// // Invokes a 3-arg constructor on SomeClass:
+// BLOCK_STATIC_THREAD_LOCAL(SomeClass, instance, arg1, arg2, arg3);
+// instance->DoSomething();
+//
+#define BLOCK_STATIC_THREAD_LOCAL(T, t, ...) \
+    static __thread T* t;                    \
+    do {                                     \
+        if (PREDICT_FALSE(t == NULL)) {      \
+            t = new T(__VA_ARGS__);          \
+            add_destructor(destroy<T>, t);   \
+        }                                    \
+    } while (false)
+
+// Class-scoped static thread local implementation.
+//
+// Very similar in implementation to the above block-scoped version, but
+// requires a bit more syntax and vigilance to use properly.
+//
+// DECLARE_STATIC_THREAD_LOCAL(Type, instance_var_) must be placed in the
+// class header, as usual for variable declarations.
+//
+// Because these variables are static, they must also be defined in the impl
+// file with DEFINE_STATIC_THREAD_LOCAL(Type, Classname, instance_var_),
+// which is very much like defining any static member, i.e. int Foo::member_.
+//
+// Finally, each thread must initialize the instance before using it by calling
+// INIT_STATIC_THREAD_LOCAL(Type, instance_var_, ...). This is a cheap
+// call, and may be invoked at the top of any method which may reference a
+// thread-local variable.
+//
+// Due to all of these requirements, you should probably declare TLS members
+// as private.
+//
+// Example usage:
+//
+// // foo.h
+// #include "kudu/utils/file.h"
+// class Foo {
+// public:
+//     void DoSomething(std::string s);
+// private:
+//     DECLARE_STATIC_THREAD_LOCAL(utils::File, file_);
+// };
+//
+// // foo.cc
+// #include "kudu/foo.h"
+// DEFINE_STATIC_THREAD_LOCAL(utils::File, Foo, file_);
+// void Foo::WriteToFile(std::string s) {
+//     // Call constructor if necessary.
+//     INIT_STATIC_THREAD_LOCAL(utils::File, file_, "/tmp/file_location.txt");
+//     file_->Write(s);
+// }
+
+// Goes in the class declaration (usually in a header file).
+#define DECLARE_STATIC_THREAD_LOCAL(T, t) static __thread T* t
+
+// You must also define the instance in the .cc file.
+#define DEFINE_STATIC_THREAD_LOCAL(T, Class, t) __thread T* Class::t
+
+// Must be invoked at least once by each thread that will access t.
+#define INIT_STATIC_THREAD_LOCAL(T, t, ...) \
+    do {                                    \
+        if (PREDICT_FALSE(t == NULL)) {     \
+            t = new T(__VA_ARGS__);         \
+            add_destructor(destroy<T>, t);  \
+        }                                   \
+    } while (false)
+
+// Internal implementation below.
+
+namespace doris {
+
+// Add a destructor to the list.
+void add_destructor(void (*destructor)(void*), void* arg);
+
+// Destroy the passed object of type T.
+template <class T>
+static void destroy(void* t) {
+    // With tcmalloc, this should be pretty cheap (same thread as new).
+    delete reinterpret_cast<T*>(t);
+}
+
+} // namespace doris
+
+#endif // THREADLOCAL_H_
diff --git a/be/src/runtime/vectorized_row_batch.cpp b/be/src/runtime/vectorized_row_batch.cpp
index 1fcdcd93582fc8..06a19fcd9e6ea8 100644
--- a/be/src/runtime/vectorized_row_batch.cpp
+++ b/be/src/runtime/vectorized_row_batch.cpp
@@ -23,14 +23,12 @@ namespace doris {
 
 VectorizedRowBatch::VectorizedRowBatch(const TabletSchema* schema,
-                                       const std::vector<uint32_t>& cols, int capacity,
-                                       const std::shared_ptr<MemTracker>& parent_tracker)
+                                       const std::vector<uint32_t>& cols, int capacity)
         : _schema(schema), _cols(cols), _capacity(capacity), _limit(capacity) {
     _selected_in_use = false;
     _size = 0;
-    _tracker = MemTracker::CreateTracker(-1, "VectorizedRowBatch", parent_tracker);
-    _mem_pool.reset(new MemPool(_tracker.get()));
+    _mem_pool.reset(new MemPool());
 
     _selected = reinterpret_cast<uint16_t*>(new char[sizeof(uint16_t) * _capacity]);
diff --git a/be/src/runtime/vectorized_row_batch.h b/be/src/runtime/vectorized_row_batch.h
index 2f29f38cc345dc..6819f01c6ab1fc 100644
--- a/be/src/runtime/vectorized_row_batch.h
+++ b/be/src/runtime/vectorized_row_batch.h
@@ -61,8 +61,7 @@ class ColumnVector {
 
 class VectorizedRowBatch {
 public:
-    VectorizedRowBatch(const TabletSchema* schema, const std::vector<uint32_t>& cols, int capacity,
-                       const std::shared_ptr<MemTracker>& parent_tracker = nullptr);
+    VectorizedRowBatch(const TabletSchema* schema, const std::vector<uint32_t>& cols, int capacity);
 
     ~VectorizedRowBatch() {
         for (auto vec : _col_vectors) {
@@ -120,7 +119,6 @@ class VectorizedRowBatch {
     bool _selected_in_use = false;
     uint8_t _block_status;
 
-    std::shared_ptr<MemTracker> _tracker;
     std::unique_ptr<MemPool> _mem_pool;
     uint16_t _limit;
 };
diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp
index fba366931eb391..c181db12ed90b9 100644
--- a/be/src/service/doris_main.cpp
+++ b/be/src/service/doris_main.cpp
@@ -46,6 +46,8 @@
 #include "runtime/exec_env.h"
 #include "runtime/heartbeat_flags.h"
 #include "runtime/minidump.h"
+#include "runtime/tcmalloc_hook.h"
+#include "runtime/thread_context.h"
 #include "service/backend_options.h"
 #include "service/backend_service.h"
 #include "service/brpc_service.h"
@@ -75,7 +77,6 @@ static void thrift_output(const char* x) {
 } // namespace doris
 
 int main(int argc, char** argv) {
-
     // check if print version or help
     if (argc > 1) {
         if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
@@ -133,6 +134,10 @@ int main(int argc, char** argv) {
         return -1;
     }
 
+    if (doris::config::use_tc_hook) {
+        init_hook();
+    }
+
 #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER)
     // Aggressive decommit is required so that unused pages in the TCMalloc page heap are
    // not backed by physical pages and do not contribute towards memory consumption.
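threadlocal.cc above hangs all per-thread cleanup off a single pthread key whose destructor walks the registered list on thread exit. A minimal sketch of that underlying POSIX mechanism, outside the patch (names here are illustrative, not Doris APIs):

#include <pthread.h>
#include <cstdio>

static pthread_key_t g_key;
static pthread_once_t g_once = PTHREAD_ONCE_INIT;

// Runs automatically when a thread with a non-null key value exits.
static void on_thread_exit(void* arg) { std::printf("cleanup %p\n", arg); }
static void make_key() { pthread_key_create(&g_key, &on_thread_exit); }

static void* worker(void*) {
    pthread_once(&g_once, &make_key);
    static int payload = 42;
    pthread_setspecific(g_key, &payload); // non-null value => destructor fires at exit
    return nullptr;
}

int main() {
    pthread_t t;
    pthread_create(&t, nullptr, &worker, nullptr);
    pthread_join(t, nullptr); // "cleanup ..." is printed as the worker exits
    return 0;
}

This is why the patch needs only one of Linux's ~1024 keys: the key's destructor receives the head of the per-thread list, and the list itself carries every registered destructor.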
@@ -263,7 +268,8 @@ int main(int argc, char** argv) {
     status = heartbeat_thrift_server->start();
     if (!status.ok()) {
-        LOG(ERROR) << "Doris BE HeartBeat Service did not start correctly, exiting: " << status.get_error_msg();
+        LOG(ERROR) << "Doris BE HeartBeat Service did not start correctly, exiting: "
+                   << status.get_error_msg();
         doris::shutdown_logging();
         exit(1);
     }
@@ -285,6 +291,72 @@ int main(int argc, char** argv) {
 #if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER)
         doris::MemInfo::refresh_current_mem();
 #endif
+        doris::ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->logout_task_mem_tracker();
         sleep(10);
     }
@@ -315,4 +387,3 @@ static void help(const char* progname) {
     printf("  -v, --version      output version information, then exit\n");
     printf("  -?, --help         show this help, then exit\n");
 }
-
diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp
index e19427ff92472c..253b130d1ee944 100644
--- a/be/src/service/http_service.cpp
+++ b/be/src/service/http_service.cpp
@@ -54,7 +54,7 @@ HttpService::HttpService(ExecEnv* env, int port, int num_threads)
 HttpService::~HttpService() {}
 
 Status HttpService::start() {
-    add_default_path_handlers(_web_page_handler.get(), _env->process_mem_tracker());
+    add_default_path_handlers(_web_page_handler.get(), MemTracker::get_process_tracker());
 
     // register load
     MiniLoadAction* miniload_action = _pool.add(new MiniLoadAction(_env));
diff --git a/be/src/testutil/function_utils.cpp b/be/src/testutil/function_utils.cpp
index 2ebb4c22f8b94a..28aaeb2455d977 100644
--- a/be/src/testutil/function_utils.cpp
+++ b/be/src/testutil/function_utils.cpp
@@ -20,7 +20,6 @@
 #include
 
 #include "runtime/mem_pool.h"
-#include "runtime/mem_tracker.h"
 #include "udf/udf_internal.h"
 
 namespace doris {
@@ -28,8 +27,7 @@ namespace doris {
 FunctionUtils::FunctionUtils() {
     doris_udf::FunctionContext::TypeDesc return_type;
     std::vector<doris_udf::FunctionContext::TypeDesc> arg_types;
-    _mem_tracker.reset(new MemTracker(-1, "function util"));
-    _memory_pool = new MemPool(_mem_tracker.get());
+    _memory_pool = new MemPool();
     _fn_ctx = FunctionContextImpl::create_context(_state, _memory_pool, return_type, arg_types, 0,
                                                   false);
 }
@@ -37,8 +35,7 @@ FunctionUtils::FunctionUtils(RuntimeState* state) {
     _state = state;
     doris_udf::FunctionContext::TypeDesc return_type;
     std::vector<doris_udf::FunctionContext::TypeDesc> arg_types;
-    _mem_tracker.reset(new MemTracker(-1, "function util"));
-    _memory_pool = new MemPool(_mem_tracker.get());
+    _memory_pool = new MemPool();
     _fn_ctx = FunctionContextImpl::create_context(_state, _memory_pool, return_type, arg_types, 0,
                                                   false);
 }
@@ -46,8 +43,7 @@ FunctionUtils::FunctionUtils(RuntimeState* state) {
 FunctionUtils::FunctionUtils(const doris_udf::FunctionContext::TypeDesc& return_type,
                              const std::vector<doris_udf::FunctionContext::TypeDesc>& arg_types,
                              int varargs_buffer_size) {
-    _mem_tracker.reset(new MemTracker(-1, "function util"));
-    _memory_pool = new MemPool(_mem_tracker.get());
+    _memory_pool = new MemPool();
     _fn_ctx = FunctionContextImpl::create_context(_state, _memory_pool, return_type, arg_types,
                                                   varargs_buffer_size, false);
 }
diff --git a/be/src/testutil/function_utils.h b/be/src/testutil/function_utils.h
index 30b2a6cdffdbeb..0b0902ddac9d7c 100644
--- a/be/src/testutil/function_utils.h
+++ b/be/src/testutil/function_utils.h
@@ -23,7 +23,6 @@ namespace doris {
 
 class MemPool;
-class MemTracker;
 class RuntimeState;
 
 class FunctionUtils {
@@ -39,7 +38,6 @@ class FunctionUtils {
 private:
     RuntimeState* _state = nullptr;
-    std::shared_ptr<MemTracker> _mem_tracker;
     MemPool* _memory_pool = nullptr;
     doris_udf::FunctionContext* _fn_ctx = nullptr;
 };
diff --git a/be/src/util/arrow/row_batch.cpp b/be/src/util/arrow/row_batch.cpp
index 3b0e0ca1db055a..590bf52017491a 100644
--- a/be/src/util/arrow/row_batch.cpp
+++ b/be/src/util/arrow/row_batch.cpp
@@ -365,9 +365,8 @@ class ToRowBatchConverter : public arrow::ArrayVisitor {
 public:
     using arrow::ArrayVisitor::Visit;
 
-    ToRowBatchConverter(const arrow::RecordBatch& batch, const RowDescriptor& row_desc,
-                        const std::shared_ptr<MemTracker>& tracker)
-            : _batch(batch), _row_desc(row_desc), _tracker(tracker) {}
+    ToRowBatchConverter(const arrow::RecordBatch& batch, const RowDescriptor& row_desc)
+            : _batch(batch), _row_desc(row_desc) {}
 
 #define PRIMITIVE_VISIT(TYPE) \
     arrow::Status Visit(const arrow::TYPE& array) override { return _visit(array); }
@@ -407,7 +406,6 @@ class ToRowBatchConverter : public arrow::ArrayVisitor {
 private:
     const arrow::RecordBatch& _batch;
     const RowDescriptor& _row_desc;
-    std::shared_ptr<MemTracker> _tracker;
 
     std::unique_ptr _cur_slot_ref;
     std::shared_ptr<RowBatch> _output;
@@ -427,7 +425,7 @@ Status ToRowBatchConverter::convert(std::shared_ptr<RowBatch>* result) {
     // TODO(zc): check if field type match
 
     size_t num_rows = _batch.num_rows();
-    _output.reset(new RowBatch(_row_desc, num_rows, _tracker.get()));
+    _output.reset(new RowBatch(_row_desc, num_rows));
     _output->commit_rows(num_rows);
     auto pool = _output->tuple_data_pool();
     for (size_t row_id = 0; row_id < num_rows; ++row_id) {
@@ -453,9 +451,8 @@ Status ToRowBatchConverter::convert(std::shared_ptr<RowBatch>* result) {
 }
 
 Status convert_to_row_batch(const arrow::RecordBatch& batch, const RowDescriptor& row_desc,
-                            const std::shared_ptr<MemTracker>& tracker,
                             std::shared_ptr<RowBatch>* result) {
-    ToRowBatchConverter converter(batch, row_desc, tracker);
+    ToRowBatchConverter converter(batch, row_desc);
     return converter.convert(result);
 }
diff --git a/be/src/util/arrow/row_batch.h b/be/src/util/arrow/row_batch.h
index a7c2f3991d5012..f75b060502f411 100644
--- a/be/src/util/arrow/row_batch.h
+++ b/be/src/util/arrow/row_batch.h
@@ -35,7 +35,6 @@ class Schema;
 
 namespace doris {
 
-class MemTracker;
 class ObjectPool;
 class RowBatch;
 class RowDescriptor;
@@ -56,10 +55,8 @@ Status convert_to_arrow_batch(const RowBatch& batch,
                               std::shared_ptr<arrow::RecordBatch>* result);
 
 // Convert an Arrow RecordBatch to a Doris RowBatch. A valid RowDescriptor
-// whose schema is the same with RecordBatch's should be given. Memory used
-// by result RowBatch will be tracked by tracker.
+// whose schema is the same with RecordBatch's should be given.
 Status convert_to_row_batch(const arrow::RecordBatch& batch, const RowDescriptor& row_desc,
-                            const std::shared_ptr<MemTracker>& tracker,
                             std::shared_ptr<RowBatch>* result);
 
 Status serialize_record_batch(const arrow::RecordBatch& record_batch, std::string* result);
diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h
index 8015dcaefac3f1..e99f5980fcd32f 100644
--- a/be/src/util/doris_metrics.h
+++ b/be/src/util/doris_metrics.h
@@ -183,6 +183,7 @@ class DorisMetrics {
 
     UIntGauge* compaction_mem_consumption;
     UIntGauge* load_mem_consumption;
+    UIntGauge* load_channel_mem_consumption;
     UIntGauge* query_mem_consumption;
     UIntGauge* schema_change_mem_consumption;
     UIntGauge* tablet_meta_mem_consumption;
diff --git a/be/src/util/mem_info.h b/be/src/util/mem_info.h
index 6ae8669f868d50..b2c556e60f4c22 100644
--- a/be/src/util/mem_info.h
+++ b/be/src/util/mem_info.h
@@ -34,6 +34,8 @@ class MemInfo {
     // Initialize MemInfo.
     static void init();
 
+    static inline bool initialized() { return _s_initialized; }
+
     // Get total physical memory in bytes (if has cgroups memory limits, return the limits).
     static inline int64_t physical_mem() {
         DCHECK(_s_initialized);
diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp
index 4fa54291820ca5..404d6dc2230b29 100644
--- a/be/src/vec/exec/join/vhash_join_node.cpp
+++ b/be/src/vec/exec/join/vhash_join_node.cpp
@@ -20,6 +20,7 @@
 #include "gen_cpp/PlanNodes_types.h"
 #include "runtime/mem_tracker.h"
 #include "runtime/runtime_filter_mgr.h"
+#include "runtime/thread_context.h"
 #include "util/defer_op.h"
 #include "vec/core/materialize_block.h"
 #include "vec/exprs/vexpr.h"
@@ -50,7 +51,7 @@ struct ProcessHashTableBuild {
         Defer defer {[&]() {
             int64_t bucket_size = hash_table_ctx.hash_table.get_buffer_size_in_cells();
             int64_t bucket_bytes = hash_table_ctx.hash_table.get_buffer_size_in_bytes();
-            _join_node->_mem_tracker->Consume(bucket_bytes - old_bucket_bytes);
+            _join_node->_hash_table_mem_tracker->consume(bucket_bytes - old_bucket_bytes);
             _join_node->_mem_used += bucket_bytes - old_bucket_bytes;
             COUNTER_SET(_join_node->_build_buckets_counter, bucket_size);
         }};
@@ -596,6 +597,8 @@ Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) {
 
 Status HashJoinNode::prepare(RuntimeState* state) {
     RETURN_IF_ERROR(ExecNode::prepare(state));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
+    _hash_table_mem_tracker = MemTracker::create_virtual_tracker(-1, "HashJoinNode:HashTable");
 
     // Build phase
     auto build_phase_profile = runtime_profile()->create_child("BuildPhase", true, true);
@@ -642,10 +645,11 @@ Status HashJoinNode::close(RuntimeState* state) {
     if (is_closed()) {
         return Status::OK();
     }
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
 
     if (_vother_join_conjunct_ptr) (*_vother_join_conjunct_ptr)->close(state);
 
-    _mem_tracker->Release(_mem_used);
+    _hash_table_mem_tracker->release(_mem_used);
     return ExecNode::close(state);
 }
 
@@ -783,6 +787,7 @@ Status HashJoinNode::get_next(RuntimeState* state, Block* output_block, bool* eo
 }
 
 Status HashJoinNode::open(RuntimeState* state) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     RETURN_IF_ERROR(ExecNode::open(state));
     RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN));
     SCOPED_TIMER(_runtime_profile->total_time_counter());
@@ -802,6 +807,7 @@ Status HashJoinNode::open(RuntimeState* state) {
 
 Status HashJoinNode::_hash_table_build(RuntimeState* state) {
     RETURN_IF_ERROR(child(1)->open(state));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB("Hash join, while constructing the hash table");
     SCOPED_TIMER(_build_timer);
     Block block;
 
@@ -811,12 +817,9 @@ Status HashJoinNode::_hash_table_build(RuntimeState* state) {
         RETURN_IF_CANCELLED(state);
 
         RETURN_IF_ERROR(child(1)->get_next(state, &block, &eos));
-        _mem_tracker->Consume(block.allocated_bytes());
+        _hash_table_mem_tracker->consume(block.allocated_bytes());
         _mem_used += block.allocated_bytes();
-        RETURN_IF_LIMIT_EXCEEDED(state, "Hash join, while getting next from the child 1.");
-
         RETURN_IF_ERROR(_process_build_block(state, block));
-        RETURN_IF_LIMIT_EXCEEDED(state, "Hash join, while constructing the hash table.");
     }
 
     return std::visit(
diff --git a/be/src/vec/exec/join/vhash_join_node.h b/be/src/vec/exec/join/vhash_join_node.h
index 65b2328e3ff6fe..589bd5294227dc 100644
--- a/be/src/vec/exec/join/vhash_join_node.h
+++ b/be/src/vec/exec/join/vhash_join_node.h
@@ -197,6 +197,8 @@ class HashJoinNode : public ::doris::ExecNode {
 
     RowDescriptor _row_desc_for_other_join_conjunt;
 
+    std::shared_ptr<MemTracker> _hash_table_mem_tracker;
+
 private:
     Status _hash_table_build(RuntimeState* state);
 
     Status _process_build_block(RuntimeState* state, Block& block);
diff --git a/be/src/vec/exec/vaggregation_node.cpp b/be/src/vec/exec/vaggregation_node.cpp
index 76b4b349efc4d0..84678c526910ac 100644
--- a/be/src/vec/exec/vaggregation_node.cpp
+++ b/be/src/vec/exec/vaggregation_node.cpp
@@ -22,6 +22,7 @@
 #include "exec/exec_node.h"
 #include "runtime/mem_pool.h"
 #include "runtime/row_batch.h"
+#include "runtime/thread_context.h"
 #include "util/defer_op.h"
 #include "vec/core/block.h"
 #include "vec/data_types/data_type_nullable.h"
@@ -203,11 +204,13 @@ void AggregationNode::_init_hash_method(std::vector<VExprContext*>& probe_exprs)
 
 Status AggregationNode::prepare(RuntimeState* state) {
     RETURN_IF_ERROR(ExecNode::prepare(state));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     _build_timer = ADD_TIMER(runtime_profile(), "BuildTime");
     _exec_timer = ADD_TIMER(runtime_profile(), "ExecTime");
     _merge_timer = ADD_TIMER(runtime_profile(), "MergeTime");
     _expr_timer = ADD_TIMER(runtime_profile(), "ExprTime");
     _get_results_timer = ADD_TIMER(runtime_profile(), "GetResultsTime");
+    _data_mem_tracker = MemTracker::create_virtual_tracker(-1, "AggregationNode:Data", mem_tracker());
 
     SCOPED_TIMER(_runtime_profile->total_time_counter());
     _intermediate_tuple_desc = state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id);
@@ -216,7 +219,7 @@ Status AggregationNode::prepare(RuntimeState* state) {
     RETURN_IF_ERROR(
             VExpr::prepare(_probe_expr_ctxs, state, child(0)->row_desc(), expr_mem_tracker()));
 
-    _mem_pool = std::make_unique<MemPool>(mem_tracker().get());
+    _mem_pool = std::make_unique<MemPool>();
 
     int j = _probe_expr_ctxs.size();
     for (int i = 0; i < j; ++i) {
@@ -330,6 +333,7 @@ Status AggregationNode::prepare(RuntimeState* state) {
 }
 
 Status AggregationNode::open(RuntimeState* state) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "aggregator, while execute open");
     RETURN_IF_ERROR(ExecNode::open(state));
     SCOPED_TIMER(_runtime_profile->total_time_counter());
 
@@ -355,7 +359,6 @@ Status AggregationNode::open(RuntimeState* state) {
         }
         RETURN_IF_ERROR(_executor.execute(&block));
         _executor.update_memusage();
-        RETURN_IF_LIMIT_EXCEEDED(state, "aggregator, while execute open.");
     }
 
     return Status::OK();
@@ -366,6 +369,7 @@ Status AggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool*
 }
 
 Status AggregationNode::get_next(RuntimeState* state, Block* block, bool* eos) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_2ARG(mem_tracker(), "aggregator, while execute get_next");
     SCOPED_TIMER(_runtime_profile->total_time_counter());
 
     if (_is_streaming_preagg) {
@@ -395,12 +399,12 @@ Status AggregationNode::get_next(RuntimeState* state, Block* block, bool* eos) {
     }
 
     _executor.update_memusage();
-    RETURN_IF_LIMIT_EXCEEDED(state, "aggregator, while execute get_next.");
     return Status::OK();
 }
 
 Status AggregationNode::close(RuntimeState* state) {
     if (is_closed()) return Status::OK();
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     RETURN_IF_ERROR(ExecNode::close(state));
 
     VExpr::close(_probe_expr_ctxs, state);
@@ -555,7 +559,7 @@ Status AggregationNode::_merge_without_key(Block* block) {
 }
 
 void AggregationNode::_update_memusage_without_key() {
-    mem_tracker()->Consume(_agg_arena_pool.size() - _mem_usage_record.used_in_arena);
+    _data_mem_tracker->consume(_agg_arena_pool.size() - _mem_usage_record.used_in_arena);
     _mem_usage_record.used_in_arena = _agg_arena_pool.size();
 }
 
@@ -1078,8 +1082,8 @@ void AggregationNode::_update_memusage_with_serialized_key() {
     std::visit(
             [&](auto&& agg_method) -> void {
                 auto& data = agg_method.data;
-                mem_tracker()->Consume(_agg_arena_pool.size() - _mem_usage_record.used_in_arena);
-                mem_tracker()->Consume(data.get_buffer_size_in_bytes() -
+                _data_mem_tracker->consume(_agg_arena_pool.size() - _mem_usage_record.used_in_arena);
+                _data_mem_tracker->consume(data.get_buffer_size_in_bytes() -
                                        _mem_usage_record.used_in_state);
                 _mem_usage_record.used_in_state = data.get_buffer_size_in_bytes();
                 _mem_usage_record.used_in_arena = _agg_arena_pool.size();
@@ -1103,7 +1107,7 @@ void AggregationNode::_close_with_serialized_key() {
 }
 
 void AggregationNode::release_tracker() {
-    mem_tracker()->Release(_mem_usage_record.used_in_state + _mem_usage_record.used_in_arena);
+    _data_mem_tracker->release(_mem_usage_record.used_in_state + _mem_usage_record.used_in_arena);
 }
 
 } // namespace doris::vectorized
diff --git a/be/src/vec/exec/vaggregation_node.h b/be/src/vec/exec/vaggregation_node.h
index 4df933f91717f0..358774d09898a6 100644
--- a/be/src/vec/exec/vaggregation_node.h
+++ b/be/src/vec/exec/vaggregation_node.h
@@ -406,6 +406,8 @@ class AggregationNode : public ::doris::ExecNode {
     bool _is_merge;
     std::unique_ptr<MemPool> _mem_pool;
 
+    std::shared_ptr<MemTracker> _data_mem_tracker;
+
     size_t _align_aggregate_states = 1;
     /// The offset to the n-th aggregate function in a row of aggregate functions.
     Sizes _offsets_of_aggregate_states;
diff --git a/be/src/vec/exec/vanalytic_eval_node.cpp b/be/src/vec/exec/vanalytic_eval_node.cpp
index 4d69716216c7cb..280acfa7e403dc 100644
--- a/be/src/vec/exec/vanalytic_eval_node.cpp
+++ b/be/src/vec/exec/vanalytic_eval_node.cpp
@@ -22,6 +22,7 @@
 #include "runtime/descriptors.h"
 #include "runtime/row_batch.h"
 #include "runtime/runtime_state.h"
+#include "runtime/thread_context.h"
 #include "udf/udf_internal.h"
 #include "vec/utils/util.hpp"
 
@@ -142,8 +143,9 @@ Status VAnalyticEvalNode::init(const TPlanNode& tnode, RuntimeState* state) {
 Status VAnalyticEvalNode::prepare(RuntimeState* state) {
     SCOPED_TIMER(_runtime_profile->total_time_counter());
     RETURN_IF_ERROR(ExecNode::prepare(state));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     DCHECK(child(0)->row_desc().is_prefix_of(row_desc()));
-    _mem_pool.reset(new MemPool(mem_tracker().get()));
+    _mem_pool.reset(new MemPool());
     _evaluation_timer = ADD_TIMER(runtime_profile(), "EvaluationTime");
     SCOPED_TIMER(_evaluation_timer);
 
@@ -207,6 +209,7 @@ Status VAnalyticEvalNode::prepare(RuntimeState* state) {
 }
 
 Status VAnalyticEvalNode::open(RuntimeState* state) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     SCOPED_TIMER(_runtime_profile->total_time_counter());
     RETURN_IF_ERROR(ExecNode::open(state));
     RETURN_IF_CANCELLED(state);
@@ -223,6 +226,7 @@ Status VAnalyticEvalNode::close(RuntimeState* state) {
     if (is_closed()) {
         return Status::OK();
     }
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     ExecNode::close(state);
     _destory_agg_status();
     return Status::OK();
}
@@ -233,6 +237,7 @@ Status VAnalyticEvalNode::get_next(RuntimeState* state, RowBatch* row_batch, boo
 }
 
 Status VAnalyticEvalNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     SCOPED_TIMER(_runtime_profile->total_time_counter());
     RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT));
     RETURN_IF_CANCELLED(state);
diff --git a/be/src/vec/exec/vblocking_join_node.cpp b/be/src/vec/exec/vblocking_join_node.cpp
index af1adb957ed7e5..3266b5de204c61 100644
--- a/be/src/vec/exec/vblocking_join_node.cpp
+++ b/be/src/vec/exec/vblocking_join_node.cpp
@@ -22,6 +22,7 @@
 #include "exprs/expr.h"
 #include "gen_cpp/PlanNodes_types.h"
 #include "runtime/runtime_state.h"
+#include "runtime/thread_context.h"
 #include "util/runtime_profile.h"
 
 namespace doris::vectorized {
@@ -39,8 +40,9 @@ Status VBlockingJoinNode::init(const TPlanNode& tnode, RuntimeState* state) {
 Status VBlockingJoinNode::prepare(RuntimeState* state) {
     SCOPED_TIMER(_runtime_profile->total_time_counter());
     RETURN_IF_ERROR(ExecNode::prepare(state));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
 
-    _build_pool.reset(new MemPool(mem_tracker().get()));
+    _build_pool.reset(new MemPool());
     _build_timer = ADD_TIMER(runtime_profile(), "BuildTime");
     _left_child_timer = ADD_TIMER(runtime_profile(), "LeftChildTime");
     _build_row_counter = ADD_COUNTER(runtime_profile(), "BuildRows", TUnit::UNIT);
@@ -62,11 +64,14 @@ Status VBlockingJoinNode::prepare(RuntimeState* state) {
 
 Status VBlockingJoinNode::close(RuntimeState* state) {
     if (is_closed()) return Status::OK();
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     ExecNode::close(state);
     return Status::OK();
 }
 
 void VBlockingJoinNode::build_side_thread(RuntimeState* state, std::promise<Status>* status) {
+    SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()),
                                    state->fragment_instance_id(), mem_tracker());
     status->set_value(construct_build_side(state));
     // Release the thread token as soon as possible (before the main thread joins
     // on it). This way, if we had a chain of 10 joins using 1 additional thread,
@@ -75,6 +80,7 @@ void VBlockingJoinNode::build_side_thread(RuntimeState* state, std::promise<Sta
 
 Status VBlockingJoinNode::open(RuntimeState* state) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     SCOPED_TIMER(_runtime_profile->total_time_counter());
     RETURN_IF_CANCELLED(state);
diff --git a/be/src/vec/exec/vcross_join_node.cpp b/be/src/vec/exec/vcross_join_node.cpp
index 6d48527f73df2f..d03ab09dbd48c8 100644
--- a/be/src/vec/exec/vcross_join_node.cpp
+++ b/be/src/vec/exec/vcross_join_node.cpp
@@ -23,6 +23,7 @@
 #include "gen_cpp/PlanNodes_types.h"
 #include "runtime/row_batch.h"
 #include "runtime/runtime_state.h"
+#include "runtime/thread_context.h"
 #include "util/runtime_profile.h"
 
 namespace doris::vectorized {
@@ -33,6 +34,8 @@ VCrossJoinNode::VCrossJoinNode(ObjectPool* pool, const TPlanNode& tnode, const D
 Status VCrossJoinNode::prepare(RuntimeState* state) {
     DCHECK(_join_op == TJoinOp::CROSS_JOIN);
     RETURN_IF_ERROR(VBlockingJoinNode::prepare(state));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
+    _block_mem_tracker = MemTracker::create_virtual_tracker(-1, "VCrossJoinNode:Block", mem_tracker());
 
     _num_existing_columns = child(0)->row_desc().num_materialized_slots();
     _num_columns_to_add = child(1)->row_desc().num_materialized_slots();
@@ -44,7 +47,7 @@ Status VCrossJoinNode::close(RuntimeState* state) {
     if (is_closed()) {
         return Status::OK();
     }
-    _mem_tracker->Release(_total_mem_usage);
+    _block_mem_tracker->release(_total_mem_usage);
     VBlockingJoinNode::close(state);
     return Status::OK();
 }
@@ -52,6 +55,7 @@ Status VCrossJoinNode::close(RuntimeState* state) {
 Status VCrossJoinNode::construct_build_side(RuntimeState* state) {
     // Do a full scan of child(1) and store all build row batches.
     RETURN_IF_ERROR(child(1)->open(state));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB("Cross join, while getting next from the child 1");
 
     bool eos = false;
     while (true) {
@@ -67,10 +71,8 @@ Status VCrossJoinNode::construct_build_side(RuntimeState* state) {
             _build_rows += rows;
             _total_mem_usage += mem_usage;
             _build_blocks.emplace_back(std::move(block));
-            _mem_tracker->Consume(mem_usage);
+            _block_mem_tracker->consume(mem_usage);
         }
-        // to prevent use too many memory
-        RETURN_IF_LIMIT_EXCEEDED(state, "Cross join, while getting next from the child 1.");
 
         if (eos) {
             break;
@@ -89,6 +91,7 @@ void VCrossJoinNode::init_get_next(int left_batch_row) {
 
 Status VCrossJoinNode::get_next(RuntimeState* state, Block* block, bool* eos) {
     RETURN_IF_CANCELLED(state);
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     *eos = false;
     SCOPED_TIMER(_runtime_profile->total_time_counter());
 
diff --git a/be/src/vec/exec/vcross_join_node.h b/be/src/vec/exec/vcross_join_node.h
index aeeeb3a2db79bb..1c8998a9f200d4 100644
--- a/be/src/vec/exec/vcross_join_node.h
+++ b/be/src/vec/exec/vcross_join_node.h
@@ -64,6 +64,8 @@ class VCrossJoinNode final : public VBlockingJoinNode {
     uint64_t _build_rows = 0;
     uint64_t _total_mem_usage = 0;
 
+    std::shared_ptr<MemTracker> _block_mem_tracker;
+
     // Build mutable columns to insert data.
     // if block can mem reuse, just clear data in block
     // else build a new block and alloc mem of column from left and right child block
diff --git a/be/src/vec/exec/ves_http_scan_node.cpp b/be/src/vec/exec/ves_http_scan_node.cpp
index a70acadb89bb2a..cbe2d2097a368e 100644
--- a/be/src/vec/exec/ves_http_scan_node.cpp
+++ b/be/src/vec/exec/ves_http_scan_node.cpp
@@ -106,7 +106,7 @@ Status VEsHttpScanNode::scanner_scan(std::unique_ptr scanner) {
     bool scanner_eof = false;
     const int batch_size = _runtime_state->batch_size();
-    std::unique_ptr<MemPool> tuple_pool(new MemPool(mem_tracker().get()));
+    std::unique_ptr<MemPool> tuple_pool(new MemPool());
     size_t slot_num = _tuple_desc->slots().size();
 
     while (!scanner_eof) {
diff --git a/be/src/vec/exec/volap_scan_node.cpp b/be/src/vec/exec/volap_scan_node.cpp
index 77f02131468cfc..5897de710acd1a 100644
--- a/be/src/vec/exec/volap_scan_node.cpp
+++ b/be/src/vec/exec/volap_scan_node.cpp
@@ -21,6 +21,7 @@
 #include "runtime/descriptors.h"
 #include "runtime/exec_env.h"
 #include "runtime/runtime_filter_mgr.h"
+#include "runtime/thread_context.h"
 #include "util/priority_thread_pool.hpp"
 #include "vec/core/block.h"
 #include "vec/exec/volap_scanner.h"
@@ -36,6 +37,8 @@ VOlapScanNode::VOlapScanNode(ObjectPool* pool, const TPlanNode& tnode, const Des
 
 void VOlapScanNode::transfer_thread(RuntimeState* state) {
     // scanner open pushdown to scanThread
+    SCOPED_ATTACH_TASK_THREAD_4ARG(state->query_type(), print_id(state->query_id()),
                                    state->fragment_instance_id(), mem_tracker());
     Status status = Status::OK();
 
     if (_vconjunct_ctx_ptr) {
@@ -69,7 +72,7 @@ void VOlapScanNode::transfer_thread(RuntimeState* state) {
     auto block_per_scanner = (doris_scanner_row_num + (block_size - 1)) / block_size;
     auto pre_block_count =
             std::min(_volap_scanners.size(),
                      static_cast<size_t>(config::doris_scanner_thread_pool_thread_num)) *
             block_per_scanner;
-
+    uint64_t buffered_bytes = 0;
     for (int i = 0; i < pre_block_count; ++i) {
         auto block = new Block;
         for (const auto slot_desc : _tuple_desc->slots()) {
@@ -80,9 +83,9 @@ void VOlapScanNode::transfer_thread(RuntimeState* state) {
                                              slot_desc->col_name()));
         }
         _free_blocks.emplace_back(block);
-        _buffered_bytes += block->allocated_bytes();
+        buffered_bytes += block->allocated_bytes();
     }
-    _mem_tracker->Consume(_buffered_bytes);
+    _block_mem_tracker->consume(buffered_bytes);
 
     // read from scanner
     while (LIKELY(status.ok())) {
@@ -139,6 +142,9 @@ void VOlapScanNode::transfer_thread(RuntimeState* state) {
 }
 
 void VOlapScanNode::scanner_thread(VOlapScanner* scanner) {
+    SCOPED_ATTACH_TASK_THREAD_4ARG(_runtime_state->query_type(),
                                    print_id(_runtime_state->query_id()),
                                    _runtime_state->fragment_instance_id(), mem_tracker());
     int64_t wait_time = scanner->update_wait_worker_timer();
     // Do not use ScopedTimer. There is no guarantee that, the counter
     // (_scan_cpu_timer, the class member) is not destroyed after `_running_thread==0`.
@@ -293,6 +299,7 @@ Status VOlapScanNode::start_scan_thread(RuntimeState* state) {
         _transfer_done = true;
         return Status::OK();
     }
+    _block_mem_tracker = MemTracker::create_virtual_tracker(-1, "VOlapScanNode:Block");
 
     // ranges constructed from scan keys
    std::vector<std::unique_ptr<OlapScanRange>> cond_ranges;
@@ -337,7 +344,7 @@ Status VOlapScanNode::start_scan_thread(RuntimeState* state) {
             }
             VOlapScanner* scanner = new VOlapScanner(state, this, _olap_scan_node.is_preaggregation,
-                                                     _need_agg_finalize, *scan_range);
+                                                     _need_agg_finalize, *scan_range, scanner_mem_tracker);
             // add scanner to pool before doing prepare.
             // so that scanner can be automatically deconstructed if prepare failed.
             _scanner_pool.add(scanner);
@@ -366,6 +373,7 @@ Status VOlapScanNode::close(RuntimeState* state) {
         return Status::OK();
     }
     RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
 
     // change done status
     {
@@ -386,7 +394,6 @@ Status VOlapScanNode::close(RuntimeState* state) {
     std::for_each(_materialized_blocks.begin(), _materialized_blocks.end(), std::default_delete<Block>());
     std::for_each(_scan_blocks.begin(), _scan_blocks.end(), std::default_delete<Block>());
     std::for_each(_free_blocks.begin(), _free_blocks.end(), std::default_delete<Block>());
-    _mem_tracker->Release(_buffered_bytes);
 
     // OlapScanNode terminate by exception
     // so that initiative close the Scanner
@@ -406,6 +413,7 @@ Status VOlapScanNode::close(RuntimeState* state) {
 }
 
 Status VOlapScanNode::get_next(RuntimeState* state, Block* block, bool* eos) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT));
     SCOPED_TIMER(_runtime_profile->total_time_counter());
 
diff --git a/be/src/vec/exec/volap_scan_node.h b/be/src/vec/exec/volap_scan_node.h
index 921399ee6bd5b1..09f77364eda279 100644
--- a/be/src/vec/exec/volap_scan_node.h
+++ b/be/src/vec/exec/volap_scan_node.h
@@ -63,6 +63,8 @@ class VOlapScanNode final : public OlapScanNode {
     std::list<VOlapScanner*> _volap_scanners;
     std::mutex _volap_scanners_lock;
 
+    std::shared_ptr<MemTracker> _block_mem_tracker;
+
     int _max_materialized_blocks;
 };
 } // namespace vectorized
diff --git a/be/src/vec/exec/volap_scanner.cpp b/be/src/vec/exec/volap_scanner.cpp
index 7b5b31eb52d770..b2ec142dcb5f24 100644
--- a/be/src/vec/exec/volap_scanner.cpp
+++ b/be/src/vec/exec/volap_scanner.cpp
@@ -30,17 +30,20 @@
 #include "vec/exec/volap_scan_node.h"
 #include "vec/exprs/vexpr_context.h"
 #include "vec/runtime/vdatetime_value.h"
+#include "runtime/thread_context.h"
 
 namespace doris::vectorized {
 
 VOlapScanner::VOlapScanner(RuntimeState* runtime_state, VOlapScanNode* parent, bool aggregation,
-                           bool need_agg_finalize, const TPaloScanRange& scan_range)
-        : OlapScanner(runtime_state, parent, aggregation, need_agg_finalize, scan_range) {
+                           bool need_agg_finalize, const TPaloScanRange& scan_range,
+                           std::shared_ptr<MemTracker> tracker)
+        : OlapScanner(runtime_state, parent, aggregation, need_agg_finalize, scan_range, tracker) {
 }
 
 Status VOlapScanner::get_block(RuntimeState* state, vectorized::Block* block, bool* eof) {
     // only empty block should be here
     DCHECK(block->rows() == 0);
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     int64_t raw_rows_threshold = raw_rows_read() + config::doris_scanner_row_num;
     if (!block->mem_reuse()) {
diff --git a/be/src/vec/exec/volap_scanner.h b/be/src/vec/exec/volap_scanner.h
index 0c1c4adf854aee..b6ef7e32ff8250 100644
--- a/be/src/vec/exec/volap_scanner.h
+++ b/be/src/vec/exec/volap_scanner.h
@@ -33,7 +33,8 @@ class VOlapScanNode;
 class VOlapScanner : public OlapScanner {
 public:
     VOlapScanner(RuntimeState* runtime_state, VOlapScanNode* parent, bool aggregation,
-                 bool need_agg_finalize, const TPaloScanRange& scan_range);
+                 bool need_agg_finalize, const TPaloScanRange& scan_range,
+                 std::shared_ptr<MemTracker> tracker);
 
     Status get_block(RuntimeState* state, vectorized::Block* block, bool* eof);
 
diff --git a/be/src/vec/exec/vset_operation_node.cpp b/be/src/vec/exec/vset_operation_node.cpp
index 3e6f73dae7226d..0be24a3ae5f6a7 100644
--- a/be/src/vec/exec/vset_operation_node.cpp
+++ b/be/src/vec/exec/vset_operation_node.cpp
@@ -17,6 +17,7 @@
 
 #include "vec/exec/vset_operation_node.h"
 
+#include "runtime/thread_context.h"
 #include "util/defer_op.h"
 #include "vec/exprs/vexpr.h"
 namespace doris {
@@ -36,10 +37,10 @@ struct HashTableBuild {
         using KeyGetter = typename HashTableContext::State;
         using Mapped = typename HashTableContext::Mapped;
         int64_t old_bucket_bytes = hash_table_ctx.hash_table.get_buffer_size_in_bytes();
-
+
         Defer defer {[&]() {
             int64_t bucket_bytes = hash_table_ctx.hash_table.get_buffer_size_in_bytes();
-            _operation_node->_mem_tracker->Consume(bucket_bytes - old_bucket_bytes);
+            _operation_node->_hash_table_mem_tracker->consume(bucket_bytes - old_bucket_bytes);
             _operation_node->_mem_used += bucket_bytes - old_bucket_bytes;
         }};
 
@@ -80,10 +81,11 @@ Status VSetOperationNode::close(RuntimeState* state) {
     if (is_closed()) {
         return Status::OK();
     }
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     for (auto& exprs : _child_expr_lists) {
         VExpr::close(exprs, state);
     }
-    _mem_tracker->Release(_mem_used);
+    _hash_table_mem_tracker->release(_mem_used);
     return ExecNode::close(state);
 }
 
@@ -111,6 +113,7 @@ Status VSetOperationNode::init(const TPlanNode& tnode, RuntimeState* state) {
 }
 
 Status VSetOperationNode::open(RuntimeState* state) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     SCOPED_TIMER(_runtime_profile->total_time_counter());
     RETURN_IF_ERROR(ExecNode::open(state));
     // open result expr lists.
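The virtual trackers introduced throughout these operators follow a strict pairing: every consume() during build is mirrored into a local _mem_used counter, and close() releases the accumulated total so the tracker ends balanced at zero. A hedged sketch of that discipline; VirtualTracker and TrackedHashTable below are illustrative stand-ins, not Doris classes:

#include <cstdint>
#include <memory>

// Illustrative stand-in for a tracker made via MemTracker::create_virtual_tracker().
struct VirtualTracker {
    int64_t consumption = 0;
    void consume(int64_t s) { consumption += s; }
    void release(int64_t s) { consumption -= s; }
};

// Mirrors the pattern in HashJoinNode / VSetOperationNode / VCrossJoinNode.
class TrackedHashTable {
public:
    explicit TrackedHashTable(std::shared_ptr<VirtualTracker> t) : _tracker(std::move(t)) {}

    // Called whenever the underlying structure grows (e.g. bucket array resize).
    void on_grow(int64_t bucket_bytes_delta) {
        _tracker->consume(bucket_bytes_delta);
        _mem_used += bucket_bytes_delta; // remember the running total for close()
    }

    // The close() equivalent: release exactly what was consumed, no more, no less.
    ~TrackedHashTable() { _tracker->release(_mem_used); }

private:
    std::shared_ptr<VirtualTracker> _tracker;
    int64_t _mem_used = 0;
};

Keeping _mem_used beside the tracker is what lets close() stay correct even when consume() was called from a Defer callback deep inside the build loop.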
@@ -123,6 +126,8 @@ Status VSetOperationNode::open(RuntimeState* state) {
 
 Status VSetOperationNode::prepare(RuntimeState* state) {
     RETURN_IF_ERROR(ExecNode::prepare(state));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
+    _hash_table_mem_tracker = MemTracker::create_virtual_tracker(-1, "VSetOperationNode:HashTable");
     SCOPED_TIMER(_runtime_profile->total_time_counter());
     _build_timer = ADD_TIMER(runtime_profile(), "BuildTime");
     _probe_timer = ADD_TIMER(runtime_profile(), "ProbeTime");
@@ -224,6 +229,8 @@ void VSetOperationNode::hash_table_init() {
 //build a hash table from child(0)
 Status VSetOperationNode::hash_table_build(RuntimeState* state) {
     RETURN_IF_ERROR(child(0)->open(state));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_CB(
+            "Set Operation Node, while constructing the hash table");
     Block block;
     bool eos = false;
     while (!eos) {
@@ -233,12 +240,9 @@ Status VSetOperationNode::hash_table_build(RuntimeState* state) {
         RETURN_IF_ERROR(child(0)->get_next(state, &block, &eos));
 
         size_t allocated_bytes = block.allocated_bytes();
-        _mem_tracker->Consume(allocated_bytes);
+        _hash_table_mem_tracker->consume(allocated_bytes);
         _mem_used += allocated_bytes;
-
-        RETURN_IF_LIMIT_EXCEEDED(state, "Set Operation Node, while getting next from the child 0.");
         RETURN_IF_ERROR(process_build_block(block));
-        RETURN_IF_LIMIT_EXCEEDED(state, "Set Operation Node, while constructing the hash table.");
     }
     return Status::OK();
 }
diff --git a/be/src/vec/exec/vset_operation_node.h b/be/src/vec/exec/vset_operation_node.h
index 93a7f8b61a3825..ea2eb7c8ea0331 100644
--- a/be/src/vec/exec/vset_operation_node.h
+++ b/be/src/vec/exec/vset_operation_node.h
@@ -89,6 +89,8 @@ class VSetOperationNode : public ExecNode {
     RuntimeProfile::Counter* _build_timer; // time to build hash table
     RuntimeProfile::Counter* _probe_timer; // time to probe
 
+    std::shared_ptr<MemTracker> _hash_table_mem_tracker;
+
     template <class HashTableContext>
     friend class HashTableBuild;
     template
diff --git a/be/src/vec/exec/vsort_node.cpp b/be/src/vec/exec/vsort_node.cpp
index 734af91baac45a..fd93977ea86118 100644
--- a/be/src/vec/exec/vsort_node.cpp
+++ b/be/src/vec/exec/vsort_node.cpp
@@ -20,6 +20,7 @@
 #include "exec/sort_exec_exprs.h"
 #include "runtime/row_batch.h"
 #include "runtime/runtime_state.h"
+#include "runtime/thread_context.h"
 #include "util/debug_util.h"
 #include "vec/core/sort_block.h"
 
@@ -43,12 +44,15 @@ Status VSortNode::prepare(RuntimeState* state) {
     SCOPED_TIMER(_runtime_profile->total_time_counter());
     _runtime_profile->add_info_string("TOP-N", _limit == -1 ? "false" : "true");
     RETURN_IF_ERROR(ExecNode::prepare(state));
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
+    _block_mem_tracker = MemTracker::create_virtual_tracker(-1, "VSortNode:Block", mem_tracker());
     RETURN_IF_ERROR(_vsort_exec_exprs.prepare(state, child(0)->row_desc(), _row_descriptor,
                                               expr_mem_tracker()));
     return Status::OK();
 }
 
 Status VSortNode::open(RuntimeState* state) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     SCOPED_TIMER(_runtime_profile->total_time_counter());
     RETURN_IF_ERROR(ExecNode::open(state));
     RETURN_IF_ERROR(_vsort_exec_exprs.open(state));
@@ -74,6 +78,7 @@ Status VSortNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos)
 }
 
 Status VSortNode::get_next(RuntimeState* state, Block* block, bool* eos) {
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
     SCOPED_TIMER(_runtime_profile->total_time_counter());
 
     auto status = Status::OK();
@@ -102,7 +107,8 @@ Status VSortNode::close(RuntimeState* state) {
     if (is_closed()) {
         return Status::OK();
     }
-    _mem_tracker->Release(_total_mem_usage);
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(mem_tracker());
+    _block_mem_tracker->release(_total_mem_usage);
     _vsort_exec_exprs.close(state);
     ExecNode::close(state);
     return Status::OK();
 }
@@ -159,7 +165,7 @@ Status VSortNode::sort_input(RuntimeState* state) {
             _sorted_blocks.emplace_back(std::move(block));
         }
-        _mem_tracker->Consume(mem_usage);
+        _block_mem_tracker->consume(mem_usage);
         RETURN_IF_CANCELLED(state);
         RETURN_IF_ERROR(state->check_query_state("vsort, while sorting input."));
     }
diff --git a/be/src/vec/exec/vsort_node.h b/be/src/vec/exec/vsort_node.h
index 66876aa149d4e4..f3e39ca1d34d59 100644
--- a/be/src/vec/exec/vsort_node.h
+++ b/be/src/vec/exec/vsort_node.h
@@ -85,6 +85,8 @@ class VSortNode : public doris::ExecNode {
     // only valid in TOP-N node
     uint64_t _num_rows_in_block = 0;
     std::priority_queue _block_priority_queue;
+
+    std::shared_ptr<MemTracker> _block_mem_tracker;
 };
 
 } // end namespace doris
diff --git a/be/src/vec/exprs/vexpr_context.cpp b/be/src/vec/exprs/vexpr_context.cpp
index a8f1d5291f7023..113abac69cfaf3 100644
--- a/be/src/vec/exprs/vexpr_context.cpp
+++ b/be/src/vec/exprs/vexpr_context.cpp
@@ -19,6 +19,7 @@
 
 #include "udf/udf_internal.h"
 #include "vec/exprs/vexpr.h"
+#include "runtime/thread_context.h"
 
 namespace doris::vectorized {
 VExprContext::VExprContext(VExpr* expr)
@@ -39,7 +40,9 @@ doris::Status VExprContext::prepare(doris::RuntimeState* state,
                                     const doris::RowDescriptor& row_desc,
                                     const std::shared_ptr<MemTracker>& tracker) {
     _prepared = true;
-    _pool.reset(new MemPool(state->instance_mem_tracker().get()));
+    _mem_tracker = tracker;
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
+    _pool.reset(new MemPool());
     return _root->prepare(state, row_desc, this);
 }
 
@@ -48,6 +51,7 @@ doris::Status VExprContext::open(doris::RuntimeState* state) {
     if (_opened) {
         return Status::OK();
     }
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     _opened = true;
     // Fragment-local state is only initialized for original contexts. Clones inherit the
     // original's fragment state and only need to have thread-local state initialized.
@@ -58,6 +62,7 @@ doris::Status VExprContext::open(doris::RuntimeState* state) {
 
 void VExprContext::close(doris::RuntimeState* state) {
     DCHECK(!_closed);
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
     FunctionContext::FunctionStateScope scope =
             _is_clone ? FunctionContext::THREAD_LOCAL : FunctionContext::FRAGMENT_LOCAL;
     _root->close(state, this, scope);
@@ -76,9 +81,10 @@ doris::Status VExprContext::clone(RuntimeState* state, VExprContext** new_ctx) {
     DCHECK(_prepared);
     DCHECK(_opened);
     DCHECK(*new_ctx == nullptr);
+    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker);
 
     *new_ctx = state->obj_pool()->add(new VExprContext(_root));
-    (*new_ctx)->_pool.reset(new MemPool(_pool->mem_tracker()));
+    (*new_ctx)->_pool.reset(new MemPool());
     for (auto& _fn_context : _fn_contexts) {
         (*new_ctx)->_fn_contexts.push_back(_fn_context->impl()->clone((*new_ctx)->_pool.get()));
     }
@@ -86,6 +92,7 @@ doris::Status VExprContext::clone(RuntimeState* state, VExprContext** new_ctx) {
     (*new_ctx)->_is_clone = true;
     (*new_ctx)->_prepared = true;
     (*new_ctx)->_opened = true;
+    (*new_ctx)->_mem_tracker = _mem_tracker;
 
     return _root->open(state, *new_ctx, FunctionContext::THREAD_LOCAL);
 }
diff --git a/be/src/vec/exprs/vexpr_context.h b/be/src/vec/exprs/vexpr_context.h
index 0021779b35b7da..2df377d770a5be 100644
--- a/be/src/vec/exprs/vexpr_context.h
+++ b/be/src/vec/exprs/vexpr_context.h
@@ -83,6 +83,8 @@ class VExprContext {
     /// and owned by this VExprContext.
     std::vector _fn_contexts;
 
+    std::shared_ptr<MemTracker> _mem_tracker;
+
     /// Pool backing fn_contexts_. Counts against the runtime state's UDF mem tracker.
     std::unique_ptr<MemPool> _pool;
 
diff --git a/be/src/vec/olap/vgeneric_iterators.cpp b/be/src/vec/olap/vgeneric_iterators.cpp
index f0f148da2d77e0..e3d853ac3a35de 100644
--- a/be/src/vec/olap/vgeneric_iterators.cpp
+++ b/be/src/vec/olap/vgeneric_iterators.cpp
@@ -259,10 +259,7 @@ Status VMergeIteratorContext::_load_next_block() {
 class VMergeIterator : public RowwiseIterator {
 public:
     // VMergeIterator takes the ownership of input iterators
-    VMergeIterator(std::vector<RowwiseIterator*>& iters, std::shared_ptr<MemTracker> parent) : _origin_iters(iters) {
-        // use for count the mem use of Block use in Merge
-        _mem_tracker = MemTracker::CreateTracker(-1, "VMergeIterator", parent, false);
-    }
+    VMergeIterator(std::vector<RowwiseIterator*>& iters) : _origin_iters(iters) {}
 
     ~VMergeIterator() override {
         while (!_merge_heap.empty()) {
@@ -350,10 +347,7 @@ class VUnionIterator : public RowwiseIterator {
     // Iterators' ownership it transfered to this class.
     // This class will delete all iterators when destructs
     // Client should not use iterators any more.
-    VUnionIterator(std::vector<RowwiseIterator*>& v, std::shared_ptr<MemTracker> parent)
-            : _origin_iters(v.begin(), v.end()) {
-        _mem_tracker = MemTracker::CreateTracker(-1, "VUnionIterator", parent, false);
-    }
+    VUnionIterator(std::vector<RowwiseIterator*>& v) : _origin_iters(v.begin(), v.end()) {}
 
     ~VUnionIterator() override {
         std::for_each(_origin_iters.begin(), _origin_iters.end(), std::default_delete<RowwiseIterator>());
@@ -403,18 +397,18 @@ Status VUnionIterator::next_batch(vectorized::Block* block) {
 }
 
-RowwiseIterator* new_merge_iterator(std::vector<RowwiseIterator*>& inputs, std::shared_ptr<MemTracker> parent) {
+RowwiseIterator* new_merge_iterator(std::vector<RowwiseIterator*>& inputs) {
     if (inputs.size() == 1) {
         return *(inputs.begin());
     }
-    return new VMergeIterator(inputs, parent);
+    return new VMergeIterator(inputs);
 }
 
-RowwiseIterator* new_union_iterator(std::vector<RowwiseIterator*>& inputs, std::shared_ptr<MemTracker> parent) {
+RowwiseIterator* new_union_iterator(std::vector<RowwiseIterator*>& inputs) {
     if (inputs.size() == 1) {
         return *(inputs.begin());
     }
-    return new VUnionIterator(inputs, parent);
+    return new VUnionIterator(inputs);
 }
 
 RowwiseIterator* new_auto_increment_iterator(const Schema& schema, size_t num_rows) {
diff --git a/be/src/vec/olap/vgeneric_iterators.h b/be/src/vec/olap/vgeneric_iterators.h
index 8177a63f8b00e2..eb2dade1be0e6d 100644
--- a/be/src/vec/olap/vgeneric_iterators.h
+++ b/be/src/vec/olap/vgeneric_iterators.h
@@ -27,14 +27,14 @@ namespace vectorized {
 //
 // Inputs iterators' ownership is taken by created merge iterator. And client
 // should delete returned iterator after usage.
-RowwiseIterator* new_merge_iterator(std::vector<RowwiseIterator*>& inputs, std::shared_ptr<MemTracker> parent);
+RowwiseIterator* new_merge_iterator(std::vector<RowwiseIterator*>& inputs);
 
 // Create a union iterator for input iterators. Union iterator will read
 // input iterators one by one.
 //
 // Inputs iterators' ownership is taken by created union iterator. And client
 // should delete returned iterator after usage.
-RowwiseIterator* new_union_iterator(std::vector<RowwiseIterator*>& inputs, std::shared_ptr<MemTracker> parent);
+RowwiseIterator* new_union_iterator(std::vector<RowwiseIterator*>& inputs);
 
 // Create an auto increment iterator which returns num_rows data in format of schema.
 // This class aims to be used in unit test.
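The SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_* macros used heavily in the stream receiver and sender below reduce to a save/switch/restore guard over thread-local state, matching update_tracker() returning the old tracker_id. A minimal sketch under that assumption; ScopedSwitchTracker and g_tracker_id are illustrative stand-ins, not the patch's actual macro expansion:

#include <string>
#include <utility>

thread_local std::string g_tracker_id = "process"; // illustrative thread-local state

// RAII guard: records the previous tracker id, installs the new one, and
// restores the old id on destruction, so nested scopes unwind in LIFO order.
class ScopedSwitchTracker {
public:
    explicit ScopedSwitchTracker(std::string new_id)
            : _old_id(std::exchange(g_tracker_id, std::move(new_id))) {}
    ~ScopedSwitchTracker() { g_tracker_id = _old_id; }

    ScopedSwitchTracker(const ScopedSwitchTracker&) = delete;
    ScopedSwitchTracker& operator=(const ScopedSwitchTracker&) = delete;

private:
    std::string _old_id;
};

// Usage mirroring the patch: every entry point of an object that owns a
// tracker (add_block, get_next, close, ...) opens such a scope, so allocations
// made by the TCMalloc hook on this thread are attributed to that tracker.
void add_block_example() {
    ScopedSwitchTracker guard("VDataStreamRecvr:example");
    // ... allocations here are charged to the receiver's tracker ...
}

This also explains why the macros appear at the top of nearly every method rather than once at construction: the thread servicing an RPC or a scan is not the thread that created the object.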
diff --git a/be/src/vec/runtime/vdata_stream_mgr.cpp b/be/src/vec/runtime/vdata_stream_mgr.cpp index 4b0bb5f75c334a..b26d11cd6bd7b4 100644 --- a/be/src/vec/runtime/vdata_stream_mgr.cpp +++ b/be/src/vec/runtime/vdata_stream_mgr.cpp @@ -53,7 +53,7 @@ std::shared_ptr VDataStreamMgr::create_recvr( VLOG_FILE << "creating receiver for fragment=" << fragment_instance_id << ", node=" << dest_node_id; std::shared_ptr recvr(new VDataStreamRecvr( - this, state->instance_mem_tracker(), row_desc, fragment_instance_id, dest_node_id, + this, row_desc, fragment_instance_id, dest_node_id, num_senders, is_merging, buffer_size, profile, sub_plan_query_statistics_recvr)); uint32_t hash_value = get_hash_value(fragment_instance_id, dest_node_id); std::lock_guard l(_lock); diff --git a/be/src/vec/runtime/vdata_stream_recvr.cpp b/be/src/vec/runtime/vdata_stream_recvr.cpp index 59e18d07a13e38..9f52fbe333f103 100644 --- a/be/src/vec/runtime/vdata_stream_recvr.cpp +++ b/be/src/vec/runtime/vdata_stream_recvr.cpp @@ -19,6 +19,7 @@ #include "gen_cpp/data.pb.h" #include "runtime/mem_tracker.h" +#include "runtime/thread_context.h" #include "util/uid_util.h" #include "vec/core/block.h" #include "vec/core/sort_cursor.h" @@ -121,7 +122,7 @@ void VDataStreamRecvr::SenderQueue::add_block(const PBlock& pblock, int be_numbe SCOPED_TIMER(_recvr->_deserialize_row_batch_timer); block = new Block(pblock); } - _recvr->_mem_tracker->Consume(block->bytes()); + _recvr->_block_mem_tracker->consume(block->bytes()); VLOG_ROW << "added #rows=" << block->rows() << " batch_size=" << block_byte_size << "\n"; _block_queue.emplace_back(block_byte_size, block); @@ -158,7 +159,7 @@ void VDataStreamRecvr::SenderQueue::add_block(Block* block, bool use_move) { size_t block_size = nblock->bytes(); _block_queue.emplace_back(block_size, nblock); - _recvr->_mem_tracker->Consume(nblock->bytes()); + _recvr->_block_mem_tracker->consume(nblock->bytes()); _data_arrival_cv.notify_one(); if (_recvr->exceeds_limit(block_size)) { @@ -241,10 +242,9 @@ void VDataStreamRecvr::SenderQueue::close() { } VDataStreamRecvr::VDataStreamRecvr( - VDataStreamMgr* stream_mgr, const std::shared_ptr& parent_tracker, - const RowDescriptor& row_desc, const TUniqueId& fragment_instance_id, - PlanNodeId dest_node_id, int num_senders, bool is_merging, int total_buffer_limit, - RuntimeProfile* profile, + VDataStreamMgr* stream_mgr, const RowDescriptor& row_desc, + const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id, int num_senders, + bool is_merging, int total_buffer_limit, RuntimeProfile* profile, std::shared_ptr sub_plan_query_statistics_recvr) : _mgr(stream_mgr), _fragment_instance_id(fragment_instance_id), @@ -256,8 +256,13 @@ VDataStreamRecvr::VDataStreamRecvr( _num_buffered_bytes(0), _profile(profile), _sub_plan_query_statistics_recvr(sub_plan_query_statistics_recvr) { - _mem_tracker = MemTracker::CreateTracker( - _profile, -1, "VDataStreamRecvr:" + print_id(_fragment_instance_id), parent_tracker); + _mem_tracker = + MemTracker::create_tracker(-1, "VDataStreamRecvr:" + print_id(_fragment_instance_id), + nullptr, MemTrackerLevel::VERBOSE, _profile); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); + _block_mem_tracker = MemTracker::create_virtual_tracker( + -1, "VDataStreamRecvr:block:" + print_id(_fragment_instance_id), nullptr, + MemTrackerLevel::VERBOSE); // Create one queue per sender if is_merging is true. int num_queues = is_merging ? 
num_senders : 1; @@ -287,6 +292,7 @@ Status VDataStreamRecvr::create_merger(const std::vector& orderin const std::vector& nulls_first, size_t batch_size, int64_t limit, size_t offset) { DCHECK(_is_merging); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); std::vector child_block_suppliers; // Create the merger that will a single stream of sorted rows. _merger.reset(new VSortedRunMerger(ordering_expr, is_asc_order, nulls_first, batch_size, limit, @@ -302,16 +308,19 @@ Status VDataStreamRecvr::create_merger(const std::vector& orderin void VDataStreamRecvr::add_block(const PBlock& pblock, int sender_id, int be_number, int64_t packet_seq, ::google::protobuf::Closure** done) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int use_sender_id = _is_merging ? sender_id : 0; _sender_queues[use_sender_id]->add_block(pblock, be_number, packet_seq, done); } void VDataStreamRecvr::add_block(Block* block, int sender_id, bool use_move) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int use_sender_id = _is_merging ? sender_id : 0; _sender_queues[use_sender_id]->add_block(block, use_move); } Status VDataStreamRecvr::get_next(Block* block, bool* eos) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (!_is_merging) { Block* res = nullptr; RETURN_IF_ERROR(_sender_queues[0]->get_batch(&res)); @@ -325,15 +334,16 @@ Status VDataStreamRecvr::get_next(Block* block, bool* eos) { RETURN_IF_ERROR(_merger->get_next(block, eos)); } - if (LIKELY(_mem_tracker->consumption() >= block->bytes())) { - _mem_tracker->Release(block->bytes()); + if (LIKELY(_block_mem_tracker->consumption() >= block->bytes())) { + _block_mem_tracker->release(block->bytes()); } else { - _mem_tracker->Release(_mem_tracker->consumption()); + _block_mem_tracker->release(_block_mem_tracker->consumption()); } return Status::OK(); } void VDataStreamRecvr::remove_sender(int sender_id, int be_number) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); int use_sender_id = _is_merging ? 
sender_id : 0; _sender_queues[use_sender_id]->decrement_senders(be_number); } @@ -349,6 +359,7 @@ void VDataStreamRecvr::close() { return; } _is_closed = true; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); for (int i = 0; i < _sender_queues.size(); ++i) { _sender_queues[i]->close(); } @@ -358,7 +369,7 @@ _mgr = nullptr; _merger.reset(); - _mem_tracker->Release(_mem_tracker->consumption()); + _block_mem_tracker->release(_block_mem_tracker->consumption()); } } // namespace doris::vectorized diff --git a/be/src/vec/runtime/vdata_stream_recvr.h b/be/src/vec/runtime/vdata_stream_recvr.h index 1292b978b52dcb..0b8c188a1bf547 100644 --- a/be/src/vec/runtime/vdata_stream_recvr.h +++ b/be/src/vec/runtime/vdata_stream_recvr.h @@ -50,10 +50,10 @@ class VExprContext; class VDataStreamRecvr { public: - VDataStreamRecvr(VDataStreamMgr* stream_mgr, const std::shared_ptr<MemTracker>& parent_tracker, - const RowDescriptor& row_desc, const TUniqueId& fragment_instance_id, - PlanNodeId dest_node_id, int num_senders, bool is_merging, - int total_buffer_limit, RuntimeProfile* profile, + VDataStreamRecvr(VDataStreamMgr* stream_mgr, const RowDescriptor& row_desc, + const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id, + int num_senders, bool is_merging, int total_buffer_limit, + RuntimeProfile* profile, std::shared_ptr<QueryStatisticsRecvr> sub_plan_query_statistics_recvr); ~VDataStreamRecvr(); @@ -73,7 +73,6 @@ class VDataStreamRecvr { const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } PlanNodeId dest_node_id() const { return _dest_node_id; } const RowDescriptor& row_desc() const { return _row_desc; } - std::shared_ptr<MemTracker> mem_tracker() const { return _mem_tracker; } void add_sub_plan_statistics(const PQueryStatistics& statistics, int sender_id) { _sub_plan_query_statistics_recvr->insert(statistics, sender_id); @@ -117,6 +116,7 @@ class VDataStreamRecvr { std::atomic<int> _num_buffered_bytes; std::shared_ptr<MemTracker> _mem_tracker; + std::shared_ptr<MemTracker> _block_mem_tracker; std::vector<SenderQueue*> _sender_queues; std::unique_ptr<VSortedRunMerger> _merger; diff --git a/be/src/vec/sink/vdata_stream_sender.cpp b/be/src/vec/sink/vdata_stream_sender.cpp index 8819eaa2362baa..a8031301c7fa8a 100644 --- a/be/src/vec/sink/vdata_stream_sender.cpp +++ b/be/src/vec/sink/vdata_stream_sender.cpp @@ -28,6 +28,7 @@ #include "runtime/mem_tracker.h" #include "runtime/runtime_state.h" +#include "runtime/thread_context.h" #include "util/proto_util.h" #include "vec/common/sip_hash.h" #include "vec/runtime/vdata_stream_mgr.h" #include "vec/runtime/vdata_stream_recvr.h" @@ -343,9 +344,10 @@ Status VDataStreamSender::prepare(RuntimeState* state) { _dest_node_id, instances); _profile = _pool->add(new RuntimeProfile(std::move(title))); SCOPED_TIMER(_profile->total_time_counter()); - _mem_tracker = MemTracker::CreateTracker( - _profile, -1, "VDataStreamSender:" + print_id(state->fragment_instance_id()), - state->instance_mem_tracker()); + _mem_tracker = MemTracker::create_tracker( + -1, "VDataStreamSender:" + print_id(state->fragment_instance_id()), nullptr, + MemTrackerLevel::VERBOSE, _profile); + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); if (_part_type == TPartitionType::UNPARTITIONED || _part_type == TPartitionType::RANDOM) { std::random_device rd; @@ -378,6 +380,7 @@ Status VDataStreamSender::prepare(RuntimeState* state) { } Status VDataStreamSender::open(RuntimeState* state) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); DCHECK(state != nullptr); RETURN_IF_ERROR(VExpr::open(_partition_expr_ctxs,
state)); for (auto iter : _partition_infos) { @@ -391,6 +394,7 @@ } Status VDataStreamSender::send(RuntimeState* state, Block* block) { + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); SCOPED_TIMER(_profile->total_time_counter()); if (_part_type == TPartitionType::UNPARTITIONED || _channels.size() == 1) { // 1. serialize depends on it is not local exchange @@ -501,6 +505,7 @@ Status VDataStreamSender::send(RuntimeState* state, Block* block) { Status VDataStreamSender::close(RuntimeState* state, Status exec_status) { if (_closed) return Status::OK(); _closed = true; + SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(_mem_tracker); Status final_st = Status::OK(); for (int i = 0; i < _channels.size(); ++i) { diff --git a/be/test/exec/hash_table_test.cpp b/be/test/exec/hash_table_test.cpp index 3502d0b9d104bf..52e1b1f08ab929 100644 --- a/be/test/exec/hash_table_test.cpp +++ b/be/test/exec/hash_table_test.cpp @@ -47,8 +47,8 @@ namespace doris { class HashTableTest : public testing::Test { public: HashTableTest() { - _tracker = MemTracker::CreateTracker(-1, "root"); - _pool_tracker = MemTracker::CreateTracker(-1, "mem-pool", _tracker); + _tracker = MemTracker::create_tracker(-1, "root"); + _pool_tracker = MemTracker::create_tracker(-1, "mem-pool", _tracker); _mem_pool.reset(new MemPool(_pool_tracker.get())); _state = _pool.add(new RuntimeState(TQueryGlobals())); _state->init_instance_mem_tracker(); @@ -196,7 +196,7 @@ TEST_F(HashTableTest, SetupTest) { // The hash table is rehashed a few times and the scans/finds are tested again. TEST_F(HashTableTest, BasicTest) { std::shared_ptr<MemTracker> hash_table_tracker = - MemTracker::CreateTracker(-1, "hash-table-basic-tracker", _tracker); + MemTracker::create_tracker(-1, "hash-table-basic-tracker", _tracker); TupleRow* build_rows[5]; TupleRow* scan_rows[5] = {0}; @@ -260,7 +260,7 @@ TEST_F(HashTableTest, BasicTest) { // This test makes sure we can scan ranges of buckets TEST_F(HashTableTest, ScanTest) { std::shared_ptr<MemTracker> hash_table_tracker = - MemTracker::CreateTracker(-1, "hash-table-scan-tracker", _tracker); + MemTracker::create_tracker(-1, "hash-table-scan-tracker", _tracker); std::vector<bool> is_null_safe = {false}; int initial_seed = 1; @@ -314,7 +314,7 @@ TEST_F(HashTableTest, GrowTableTest) { int expected_size = 0; std::shared_ptr<MemTracker> mem_tracker = - MemTracker::CreateTracker(1024 * 1024, "hash-table-grow-tracker", _tracker); + MemTracker::create_tracker(1024 * 1024, "hash-table-grow-tracker", _tracker); std::vector<bool> is_null_safe = {false}; int initial_seed = 1; int64_t num_buckets = 4; @@ -357,7 +357,7 @@ TEST_F(HashTableTest, GrowTableTest2) { int expected_size = 0; std::shared_ptr<MemTracker> mem_tracker = - MemTracker::CreateTracker(1024 * 1024 * 1024, "hash-table-grow2-tracker", _tracker); + MemTracker::create_tracker(1024 * 1024 * 1024, "hash-table-grow2-tracker", _tracker); std::vector<bool> is_null_safe = {false}; int initial_seed = 1; int64_t num_buckets = 4; diff --git a/be/test/exec/tablet_sink_test.cpp b/be/test/exec/tablet_sink_test.cpp index 3d55699a6f6a8c..e59e972097ff78 100644 --- a/be/test/exec/tablet_sink_test.cpp +++ b/be/test/exec/tablet_sink_test.cpp @@ -57,6 +57,7 @@ class OlapTableSinkTest : public testing::Test { _env->_internal_client_cache = new BrpcClientCache<PBackendService_Stub>(); _env->_function_client_cache = new BrpcClientCache<PFunctionService_Stub>(); _env->_buffer_reservation = new ReservationTracker(); + _env->_task_pool_mem_tracker_registry.reset(new MemTrackerTaskPool()); ThreadPoolBuilder("SendBatchThreadPool")
.set_min_threads(1) .set_max_threads(5) diff --git a/be/test/exprs/bloom_filter_predicate_test.cpp b/be/test/exprs/bloom_filter_predicate_test.cpp index ca6e5a9f0d4795..1cba866e7d1fe8 100644 --- a/be/test/exprs/bloom_filter_predicate_test.cpp +++ b/be/test/exprs/bloom_filter_predicate_test.cpp @@ -31,7 +31,7 @@ class BloomFilterPredicateTest : public testing::Test { }; TEST_F(BloomFilterPredicateTest, bloom_filter_func_int_test) { - auto tracker = MemTracker::CreateTracker(); + auto tracker = MemTracker::create_tracker(); std::unique_ptr<IBloomFilterFuncBase> func( create_bloom_filter(tracker.get(), PrimitiveType::TYPE_INT)); ASSERT_TRUE(func->init(1024, 0.05).ok()); @@ -53,7 +53,7 @@ TEST_F(BloomFilterPredicateTest, bloom_filter_func_int_test) { } TEST_F(BloomFilterPredicateTest, bloom_filter_func_stringval_test) { - auto tracker = MemTracker::CreateTracker(); + auto tracker = MemTracker::create_tracker(); std::unique_ptr<IBloomFilterFuncBase> func( create_bloom_filter(tracker.get(), PrimitiveType::TYPE_VARCHAR)); ASSERT_TRUE(func->init(1024, 0.05).ok()); @@ -104,7 +104,7 @@ TEST_F(BloomFilterPredicateTest, bloom_filter_func_stringval_test) { } TEST_F(BloomFilterPredicateTest, bloom_filter_size_test) { - auto tracker = MemTracker::CreateTracker(); + auto tracker = MemTracker::create_tracker(); std::unique_ptr<IBloomFilterFuncBase> func( create_bloom_filter(tracker.get(), PrimitiveType::TYPE_VARCHAR)); int length = 4096; diff --git a/be/test/olap/bloom_filter_column_predicate_test.cpp b/be/test/olap/bloom_filter_column_predicate_test.cpp index 24abea12151ea9..7921fc88de9dda 100644 --- a/be/test/olap/bloom_filter_column_predicate_test.cpp +++ b/be/test/olap/bloom_filter_column_predicate_test.cpp @@ -95,7 +95,7 @@ TEST_F(TestBloomFilterColumnPredicate, FLOAT_COLUMN) { return_columns.push_back(i); } - auto tracker = MemTracker::CreateTracker(-1, "OlapScanner"); + auto tracker = MemTracker::create_tracker(-1, "OlapScanner"); std::shared_ptr<IBloomFilterFuncBase> bloom_filter( create_bloom_filter(tracker.get(), PrimitiveType::TYPE_FLOAT)); diff --git a/be/test/olap/generic_iterators_test.cpp b/be/test/olap/generic_iterators_test.cpp index b73ad0271e6531..59bff103ba53bc 100644 --- a/be/test/olap/generic_iterators_test.cpp +++ b/be/test/olap/generic_iterators_test.cpp @@ -83,8 +83,8 @@ TEST(GenericIteratorsTest, Union) { inputs.push_back(new_auto_increment_iterator(schema, 200)); inputs.push_back(new_auto_increment_iterator(schema, 300)); - auto iter = new_union_iterator(inputs, - MemTracker::CreateTracker(-1, "UnionIterator", nullptr, false)); + auto iter = new_union_iterator(std::move(inputs), + MemTracker::create_tracker(-1, "UnionIterator", nullptr)); StorageReadOptions opts; auto st = iter->init(opts); ASSERT_TRUE(st.ok()); @@ -124,7 +124,7 @@ TEST(GenericIteratorsTest, Merge) { inputs.push_back(new_auto_increment_iterator(schema, 300)); auto iter = new_merge_iterator( - std::move(inputs), MemTracker::CreateTracker(-1, "MergeIterator", nullptr, false), -1); + std::move(inputs), MemTracker::create_tracker(-1, "MergeIterator", nullptr), -1); StorageReadOptions opts; auto st = iter->init(opts); ASSERT_TRUE(st.ok()); diff --git a/be/test/runtime/buffered_tuple_stream2_test.cpp b/be/test/runtime/buffered_tuple_stream2_test.cpp index e31d935fc1504e..4a522ad84fda4e 100644 --- a/be/test/runtime/buffered_tuple_stream2_test.cpp +++ b/be/test/runtime/buffered_tuple_stream2_test.cpp @@ -117,7 +117,7 @@ class SimpleTupleStreamTest : public testing::Test { } virtual RowBatch* CreateIntBatch(int offset, int num_rows, bool gen_null) { - RowBatch* batch = _pool.add(new RowBatch(*_int_desc,
num_rows, _tracker.get())); + RowBatch* batch = _pool.add(new RowBatch(*_int_desc, num_rows)); int tuple_size = _int_desc->tuple_descriptors()[0]->byte_size(); uint8_t* tuple_mem = reinterpret_cast<uint8_t*>( batch->tuple_data_pool()->allocate(tuple_size * num_rows)); @@ -146,7 +146,7 @@ class SimpleTupleStreamTest : public testing::Test { virtual RowBatch* CreateStringBatch(int offset, int num_rows, bool gen_null) { int tuple_size = sizeof(StringValue) + 1; - RowBatch* batch = _pool.add(new RowBatch(*_string_desc, num_rows, _tracker.get())); + RowBatch* batch = _pool.add(new RowBatch(*_string_desc, num_rows)); uint8_t* tuple_mem = batch->tuple_data_pool()->allocate(tuple_size * num_rows); memset(tuple_mem, 0, tuple_size * num_rows); const int string_tuples = _string_desc->tuple_descriptors().size(); diff --git a/be/test/runtime/fragment_mgr_test.cpp b/be/test/runtime/fragment_mgr_test.cpp index ffe11b44eb2c82..304d6d8f77db8c 100644 --- a/be/test/runtime/fragment_mgr_test.cpp +++ b/be/test/runtime/fragment_mgr_test.cpp @@ -47,7 +47,8 @@ Status PlanFragmentExecutor::open() { return s_open_status; } -void PlanFragmentExecutor::cancel() {} +void PlanFragmentExecutor::cancel(const PPlanFragmentCancelReason& reason, const std::string& msg) { +} void PlanFragmentExecutor::set_abort() { LOG(INFO) << "Plan Aborted"; diff --git a/be/test/runtime/mem_limit_test.cpp b/be/test/runtime/mem_limit_test.cpp index b2c4017ea1e47f..378b9c5d083e5b 100644 --- a/be/test/runtime/mem_limit_test.cpp +++ b/be/test/runtime/mem_limit_test.cpp @@ -24,121 +24,121 @@ namespace doris { TEST(MemTrackerTest, SingleTrackerNoLimit) { - auto t = MemTracker::CreateTracker(); + auto t = MemTracker::create_tracker(); EXPECT_FALSE(t->has_limit()); - t->Consume(10); + t->consume(10); EXPECT_EQ(t->consumption(), 10); - t->Consume(10); + t->consume(10); EXPECT_EQ(t->consumption(), 20); - t->Release(15); + t->release(15); EXPECT_EQ(t->consumption(), 5); - EXPECT_FALSE(t->LimitExceeded(MemLimit::HARD)); - t->Release(5); + EXPECT_FALSE(t->limit_exceeded()); + t->release(5); } TEST(MemTestTest, SingleTrackerWithLimit) { - auto t = MemTracker::CreateTracker(11, "limit tracker"); + auto t = MemTracker::create_tracker(11, "limit tracker"); EXPECT_TRUE(t->has_limit()); - t->Consume(10); + t->consume(10); EXPECT_EQ(t->consumption(), 10); - EXPECT_FALSE(t->LimitExceeded(MemLimit::HARD)); - t->Consume(10); + EXPECT_FALSE(t->limit_exceeded()); + t->consume(10); EXPECT_EQ(t->consumption(), 20); - EXPECT_TRUE(t->LimitExceeded(MemLimit::HARD)); - t->Release(15); + EXPECT_TRUE(t->limit_exceeded()); + t->release(15); EXPECT_EQ(t->consumption(), 5); - EXPECT_FALSE(t->LimitExceeded(MemLimit::HARD)); - t->Release(5); + EXPECT_FALSE(t->limit_exceeded()); + t->release(5); } TEST(MemTestTest, TrackerHierarchy) { - auto p = MemTracker::CreateTracker(100); - auto c1 = MemTracker::CreateTracker(80, "c1", p); - auto c2 = MemTracker::CreateTracker(50, "c2", p); + auto p = MemTracker::create_tracker(100); + auto c1 = MemTracker::create_tracker(80, "c1", p); + auto c2 = MemTracker::create_tracker(50, "c2", p); // everything below limits - c1->Consume(60); + c1->consume(60); EXPECT_EQ(c1->consumption(), 60); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_FALSE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 0); - EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c2->limit_exceeded()); +
EXPECT_FALSE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 60); - EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(p->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(p->limit_exceeded()); + EXPECT_FALSE(p->any_limit_exceeded()); // p goes over limit - c2->Consume(50); + c2->consume(50); EXPECT_EQ(c1->consumption(), 60); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_TRUE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_TRUE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 50); - EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_TRUE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c2->limit_exceeded()); + EXPECT_TRUE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 110); - EXPECT_TRUE(p->LimitExceeded(MemLimit::HARD)); + EXPECT_TRUE(p->limit_exceeded()); // c2 goes over limit, p drops below limit - c1->Release(20); - c2->Consume(10); + c1->release(20); + c2->consume(10); EXPECT_EQ(c1->consumption(), 40); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_FALSE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 60); - EXPECT_TRUE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_TRUE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_TRUE(c2->limit_exceeded()); + EXPECT_TRUE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 100); - EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD)); - c1->Release(40); - c2->Release(60); + EXPECT_FALSE(p->limit_exceeded()); + c1->release(40); + c2->release(60); } TEST(MemTestTest, TrackerHierarchyTryConsume) { - auto p = MemTracker::CreateTracker(100); - auto c1 = MemTracker::CreateTracker(80, "c1", p); - auto c2 = MemTracker::CreateTracker(50, "c2", p); + auto p = MemTracker::create_tracker(100); + auto c1 = MemTracker::create_tracker(80, "c1", p); + auto c2 = MemTracker::create_tracker(50, "c2", p); // everything below limits - bool consumption = c1->TryConsume(60).ok(); + bool consumption = c1->try_consume(60).ok(); EXPECT_EQ(consumption, true); EXPECT_EQ(c1->consumption(), 60); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_FALSE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 0); - EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c2->limit_exceeded()); + EXPECT_FALSE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 60); - EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(p->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(p->limit_exceeded()); + EXPECT_FALSE(p->any_limit_exceeded()); // p goes over limit - consumption = c2->TryConsume(50).ok(); + consumption = c2->try_consume(50).ok(); EXPECT_EQ(consumption, false); EXPECT_EQ(c1->consumption(), 60); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_FALSE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 0); - EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c2->limit_exceeded()); + EXPECT_FALSE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 60); - EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(p->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(p->limit_exceeded()); + EXPECT_FALSE(p->any_limit_exceeded()); // 
c2 goes over limit, p drops below limit - c1->Release(20); - c2->Consume(10); + c1->release(20); + c2->consume(10); EXPECT_EQ(c1->consumption(), 40); - EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c1->limit_exceeded()); + EXPECT_FALSE(c1->any_limit_exceeded()); EXPECT_EQ(c2->consumption(), 10); - EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD)); - EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(c2->limit_exceeded()); + EXPECT_FALSE(c2->any_limit_exceeded()); EXPECT_EQ(p->consumption(), 50); - EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD)); + EXPECT_FALSE(p->limit_exceeded()); - c1->Release(40); - c2->Release(10); + c1->release(40); + c2->release(10); } } // end namespace doris diff --git a/be/test/runtime/memory_scratch_sink_test.cpp b/be/test/runtime/memory_scratch_sink_test.cpp index b2443aced80aa7..e20f1023fce790 100644 --- a/be/test/runtime/memory_scratch_sink_test.cpp +++ b/be/test/runtime/memory_scratch_sink_test.cpp @@ -115,7 +115,7 @@ void MemoryScratchSinkTest::init_runtime_state() { _state = new RuntimeState(query_id, query_options, TQueryGlobals(), _env->exec_env()); _state->init_instance_mem_tracker(); _mem_tracker = - MemTracker::CreateTracker(-1, "MemoryScratchSinkTest", _state->instance_mem_tracker()); + MemTracker::create_tracker(-1, "MemoryScratchSinkTest", _state->instance_mem_tracker()); _state->set_desc_tbl(_desc_tbl); _state->_load_dir = "./test_run/output/"; _state->init_mem_trackers(TUniqueId()); diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc index eafaaed4badc80..6068e529863614 100644 --- a/be/test/runtime/test_env.cc +++ b/be/test/runtime/test_env.cc @@ -31,15 +31,14 @@ namespace doris { TestEnv::TestEnv() - : _block_mgr_parent_tracker(MemTracker::CreateTracker(-1, "BufferedBlockMgr2")), - _io_mgr_tracker(MemTracker::CreateTracker(-1, "DiskIoMgr")) { + : _block_mgr_parent_tracker(MemTracker::create_tracker(-1, "BufferedBlockMgr2")) { // Some code will use ExecEnv::GetInstance(), so init the global ExecEnv singleton _exec_env = ExecEnv::GetInstance(); _exec_env->_thread_mgr = new ThreadResourceMgr(2); _exec_env->_buffer_reservation = new ReservationTracker(); - _exec_env->_mem_tracker = MemTracker::CreateTracker(-1, "TestEnv"); + _exec_env->_task_pool_mem_tracker_registry.reset(new MemTrackerTaskPool()); _exec_env->_disk_io_mgr = new DiskIoMgr(1, 1, 1, 10); - _exec_env->disk_io_mgr()->init(_io_mgr_tracker); + _exec_env->disk_io_mgr()->init(-1); _exec_env->_scan_thread_pool = new PriorityThreadPool(1, 16); _exec_env->_result_queue_mgr = new ResultQueueMgr(); // TODO may need rpc support, etc. 
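The receiver and sender hunks above route allocations to the right tracker by switching a thread-local MemTracker pointer at the top of each method (`SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG`). The macro's definition is not part of this diff; below is only a minimal sketch of how such a scoped switch could work, with `ThreadLocalTrackerGuard` and `tls_tracker` as assumed names rather than the actual Doris implementation:

```cpp
#include <memory>
#include <utility>

class MemTracker; // tracker type as used throughout this PR

// Illustrative only: the thread-local tracker that the TCMalloc new/delete
// hooks would consume against.
inline thread_local std::shared_ptr<MemTracker> tls_tracker;

// RAII guard: swap in a tracker for the current scope and restore the
// previous one on exit. A macro like
// SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_1ARG(t) could expand to a local
// ThreadLocalTrackerGuard instance.
class ThreadLocalTrackerGuard {
public:
    explicit ThreadLocalTrackerGuard(std::shared_ptr<MemTracker> t)
            : _saved(std::move(tls_tracker)) {
        tls_tracker = std::move(t);
    }
    ~ThreadLocalTrackerGuard() { tls_tracker = std::move(_saved); }

    ThreadLocalTrackerGuard(const ThreadLocalTrackerGuard&) = delete;
    ThreadLocalTrackerGuard& operator=(const ThreadLocalTrackerGuard&) = delete;

private:
    std::shared_ptr<MemTracker> _saved;
};
```

The RAII form matters here: early exits such as RETURN_IF_ERROR in the hunks above still restore the previous tracker when the scope unwinds.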
diff --git a/be/test/util/arrow/arrow_work_flow_test.cpp b/be/test/util/arrow/arrow_work_flow_test.cpp index 658a5ac3a6f698..5a5f2dc36c14e6 100644 --- a/be/test/util/arrow/arrow_work_flow_test.cpp +++ b/be/test/util/arrow/arrow_work_flow_test.cpp @@ -91,6 +91,7 @@ void ArrowWorkFlowTest::init_runtime_state() { _exec_env->_result_queue_mgr = new ResultQueueMgr(); _exec_env->_thread_mgr = new ThreadResourceMgr(); _exec_env->_buffer_reservation = new ReservationTracker(); + _exec_env->_task_pool_mem_tracker_registry.reset(new MemTrackerTaskPool()); TQueryOptions query_options; query_options.batch_size = 1024; TUniqueId query_id; @@ -99,7 +100,7 @@ void ArrowWorkFlowTest::init_runtime_state() { _state = new RuntimeState(query_id, query_options, TQueryGlobals(), _exec_env); _state->init_instance_mem_tracker(); _mem_tracker = - MemTracker::CreateTracker(-1, "ArrowWorkFlowTest", _state->instance_mem_tracker()); + MemTracker::create_tracker(-1, "ArrowWorkFlowTest", _state->instance_mem_tracker()); _state->set_desc_tbl(_desc_tbl); _state->_load_dir = "./test_run/output/"; _state->init_mem_trackers(TUniqueId()); diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index 98d0c9b0991103..771d767c762676 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -1431,16 +1431,36 @@ The size of the buffer before flushing * Default: 3 +### `use_tc_hook` + +* Type: bool +* Description: Whether to install the TCMalloc new/delete hooks; MemTracker consumption is currently counted in these hooks. +* Default: true + ### `mem_tracker_level` * Type: int16 * Description: Controls which MemTrackers are shown on the Web page; a tracker whose level is equal to or lower than this value is displayed ``` - RELEASE = 0 - DEBUG = 1 + OVERVIEW = 0 + TASK = 1 + INSTANCE = 2 + VERBOSE = 3 ``` * Default: 0 +### `mem_tracker_consume_min_size_bytes` + +* Type: int32 +* Description: The minimum batch, in bytes, with which the TCMalloc hooks call MemTracker consume/release. Smaller sizes keep accumulating so that consume/release is not called too often. Decreasing this value increases the call frequency; increasing it makes MemTracker statistics less accurate. In theory, the statistics of a MemTracker can deviate from the true value by up to (mem_tracker_consume_min_size_bytes * the number of BE threads in which the MemTracker is used). +* Default: 2097152 + +### `memory_leak_detection` + +* Type: bool +* Description: Whether to enable memory leak detection. A negative MemTracker consumption is treated as a memory leak, but inaccurate MemTracker records can also produce negative values, so this feature is still experimental. +* Default: false + ### `max_segment_num_per_rowset` * Type: int32
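The batching that `mem_tracker_consume_min_size_bytes` describes can be pictured with a small sketch. This is not the actual hook code; `tls_untracked_bytes`, `tracker_consume`, and `consume_cached` are assumed names, and only the threshold value comes from the config above:

```cpp
#include <cstdint>

// Stand-in for the config knob documented above (2 MB default).
static const int64_t mem_tracker_consume_min_size_bytes = 2097152;

// Bytes seen by the new/delete hooks but not yet pushed to the tracker.
static thread_local int64_t tls_untracked_bytes = 0;

// Would forward to the thread-local MemTracker; trivial body for the sketch.
static void tracker_consume(int64_t bytes) {
    (void)bytes;
}

// Called from the TCMalloc new/delete hooks: batch small allocations and
// frees, and only touch the (atomic, hence comparatively expensive) tracker
// once the net batch crosses the threshold in either direction.
void consume_cached(int64_t bytes) {
    tls_untracked_bytes += bytes; // negative for frees
    if (tls_untracked_bytes >= mem_tracker_consume_min_size_bytes ||
        tls_untracked_bytes <= -mem_tracker_consume_min_size_bytes) {
        tracker_consume(tls_untracked_bytes);
        tls_untracked_bytes = 0;
    }
}
```

This is also where the documented error bound comes from: each thread can hold back up to the threshold, so a tracker touched by N threads can be off by roughly N * mem_tracker_consume_min_size_bytes.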
diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index 83ce1563586950..07587dd4545f81 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -1450,16 +1450,36 @@ Default number of webserver worker threads ``` * Default: 3 +### `use_tc_hook` + +* Type: bool +* Description: Whether to install the TCMalloc new/delete hooks; MemTracker consumption is currently counted in these hooks. +* Default: true + ### `mem_tracker_level` * Type: int16 * Description: Controls which MemTrackers are shown on the Web page; a tracker whose level is equal to or lower than this value is displayed ``` - RELEASE = 0 - DEBUG = 1 + OVERVIEW = 0 + TASK = 1 + INSTANCE = 2 + VERBOSE = 3 ``` * Default: 0 +### `mem_tracker_consume_min_size_bytes` + +* Type: int32 +* Description: The minimum batch, in bytes, with which the TCMalloc hooks call MemTracker consume/release. Smaller sizes keep accumulating so that consume/release is not called too often. Decreasing this value increases the call frequency; increasing it makes MemTracker statistics less accurate. In theory, the statistics of a MemTracker can deviate from the true value by up to (mem_tracker_consume_min_size_bytes * the number of BE threads in which the MemTracker is used). +* Default: 2097152 + +### `memory_leak_detection` + +* Type: bool +* Description: Whether to enable memory leak detection. A negative MemTracker consumption is treated as a memory leak, but inaccurate MemTracker records can also produce negative values, so this feature is still experimental. +* Default: false + ### `max_segment_num_per_rowset` * Type: int32
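For `memory_leak_detection`, the documented behavior amounts to flagging a tracker whose balance drops below zero on release. A sketch under assumed names (`SimpleTracker` is illustrative, not the Doris MemTracker):

```cpp
#include <atomic>
#include <cstdint>
#include <iostream>
#include <string>

// Stand-in for the config flag documented above.
static const bool memory_leak_detection = false;

// Not the Doris MemTracker; just enough structure to show the check.
class SimpleTracker {
public:
    explicit SimpleTracker(std::string label) : _label(std::move(label)) {}

    void consume(int64_t bytes) { _consumption.fetch_add(bytes); }

    void release(int64_t bytes) {
        // fetch_sub returns the previous value; compute the balance after.
        int64_t after = _consumption.fetch_sub(bytes) - bytes;
        // More released than consumed: either a genuine pairing bug, or an
        // artifact of the batched hook accounting above -- which is why the
        // flag is documented as experimental.
        if (memory_leak_detection && after < 0) {
            std::cerr << "suspected leak in tracker " << _label
                      << ": consumption=" << after << std::endl;
        }
    }

private:
    std::string _label;
    std::atomic<int64_t> _consumption{0};
};
```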
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java index dd3fcdce1703c8..d01d6f99b362c2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java @@ -534,7 +534,8 @@ private void sendFragment() throws TException, RpcException, UserException { if (colocateFragmentIds.contains(fragment.getFragmentId().asInt())) { int rate = Math.min(Config.query_colocate_join_memory_limit_penalty_factor, instanceNum); long newmemory = memoryLimit / rate; - + // TODO(zxy): The mem_limit in query_options now means the real per-query memory limit, + // so the logic that modifies mem_limit here needs to be revised or removed. for (TExecPlanFragmentParams tParam : tParams) { tParam.query_options.setMemLimit(newmemory); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index c49134f64ffdab..740c440536b5ce 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -44,6 +44,7 @@ public class SessionVariable implements Serializable, Writable { static final Logger LOG = LogManager.getLogger(SessionVariable.class); public static final String EXEC_MEM_LIMIT = "exec_mem_limit"; + public static final String QUERY_MEM_LIMIT = "query_mem_limit"; public static final String QUERY_TIMEOUT = "query_timeout"; public static final String ENABLE_PROFILE = "enable_profile"; public static final String SQL_MODE = "sql_mode"; @@ -184,10 +185,14 @@ public class SessionVariable implements Serializable, Writable { @VariableMgr.VarAttr(name = INSERT_VISIBLE_TIMEOUT_MS, needForward = true) public long insertVisibleTimeoutMs = DEFAULT_INSERT_VISIBLE_TIMEOUT_MS; - // max memory used on every backend. + // max instance memory used on every backend. @VariableMgr.VarAttr(name = EXEC_MEM_LIMIT) public long maxExecMemByte = 2147483648L; + // max query memory used on every backend. + @VariableMgr.VarAttr(name = QUERY_MEM_LIMIT) + public long maxQueryMemByte = 0L; + @VariableMgr.VarAttr(name = ENABLE_SPILLING) public boolean enableSpilling = false; @@ -427,6 +432,10 @@ public long getMaxExecMemByte() { return maxExecMemByte; } + public long getMaxQueryMemByte() { + return maxQueryMemByte; + } + public long getLoadMemLimit() { return loadMemLimit; } @@ -553,6 +562,14 @@ public void setMaxExecMemByte(long maxExecMemByte) { } } + public void setMaxQueryMemByte(long maxQueryMemByte) { + if (maxQueryMemByte < MIN_EXEC_MEM_LIMIT) { + this.maxQueryMemByte = MIN_EXEC_MEM_LIMIT; + } else { + this.maxQueryMemByte = maxQueryMemByte; + } + } + public boolean isSqlQuoteShowCreate() { return sqlQuoteShowCreate; } diff --git a/gensrc/proto/internal_service.proto b/gensrc/proto/internal_service.proto index 41a0dce4bb13ba..76c5f95ec5803e 100644 --- a/gensrc/proto/internal_service.proto +++ b/gensrc/proto/internal_service.proto @@ -143,6 +143,7 @@ enum PPlanFragmentCancelReason { USER_CANCEL = 2; INTERNAL_ERROR = 3; TIMEOUT = 4; + MEMORY_LIMIT_EXCEED = 5; }; message PCancelPlanFragmentRequest {
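The new MEMORY_LIMIT_EXCEED value pairs with the two-argument PlanFragmentExecutor::cancel(reason, msg) signature stubbed out in be/test/runtime/fragment_mgr_test.cpp above. How a backend might use it is sketched below; apart from the enum values shown in the proto hunk, every name here is illustrative:

```cpp
#include <cstdint>
#include <iostream>
#include <string>

// Values as in gensrc/proto/internal_service.proto after this change;
// earlier enum entries are elided here.
enum PPlanFragmentCancelReason {
    USER_CANCEL = 2,
    INTERNAL_ERROR = 3,
    TIMEOUT = 4,
    MEMORY_LIMIT_EXCEED = 5,
};

// Stand-in for PlanFragmentExecutor::cancel(reason, msg).
void cancel_fragment(PPlanFragmentCancelReason reason, const std::string& msg) {
    std::cerr << "cancel fragment: reason=" << reason << " msg=" << msg << std::endl;
}

// Hypothetical periodic check against the per-query tracker: cancel with the
// specific reason instead of a generic INTERNAL_ERROR when the limit is hit.
void maybe_cancel_on_mem_limit(int64_t consumption, int64_t limit) {
    if (limit >= 0 && consumption > limit) {
        cancel_fragment(MEMORY_LIMIT_EXCEED, "query memory exceeds limit");
    }
}
```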